Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/>  <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b">  <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\$","\$"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a>  <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div>  <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div>  <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 723 results for author: <span class="mathjax">Han, Z</span> </h1> </div> <div class="level-right is-hidden-mobile">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Han%2C+Z">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Han, Z"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Han%2C+Z&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Han, Z"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Han%2C+Z&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Han%2C+Z&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Han%2C+Z&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Han%2C+Z&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Han%2C+Z&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Han%2C+Z&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.07598">arXiv:2503.07598</a> <span> [<a href="https://arxiv.org/pdf/2503.07598">pdf</a>, <a href="https://arxiv.org/format/2503.07598">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> VACE: All-in-One Video Creation and Editing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jiang%2C+Z">Zeyinzi Jiang</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zhen Han</a>, <a href="/search/cs?searchtype=author&query=Mao%2C+C">Chaojie Mao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jingfeng Zhang</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+Y">Yulin Pan</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yu Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.07598v1-abstract-short" style="display: inline;"> Diffusion Transformer has demonstrated powerful capability and scalability in generating high-quality images and videos. Further pursuing the unification of generation and editing tasks has yielded significant progress in the domain of image content creation. However, due to the intrinsic demands for consistency across both temporal and spatial dynamics, achieving a unified approach for video synt… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.07598v1-abstract-full').style.display = 'inline'; document.getElementById('2503.07598v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.07598v1-abstract-full" style="display: none;"> Diffusion Transformer has demonstrated powerful capability and scalability in generating high-quality images and videos. Further pursuing the unification of generation and editing tasks has yielded significant progress in the domain of image content creation. However, due to the intrinsic demands for consistency across both temporal and spatial dynamics, achieving a unified approach for video synthesis remains challenging. We introduce VACE, which enables users to perform Video tasks within an All-in-one framework for Creation and Editing. These tasks include reference-to-video generation, video-to-video editing, and masked video-to-video editing. Specifically, we effectively integrate the requirements of various tasks by organizing video task inputs, such as editing, reference, and masking, into a unified interface referred to as the Video Condition Unit (VCU). Furthermore, by utilizing a Context Adapter structure, we inject different task concepts into the model using formalized representations of temporal and spatial dimensions, allowing it to handle arbitrary video synthesis tasks flexibly. Extensive experiments demonstrate that the unified model of VACE achieves performance on par with task-specific models across various subtasks. Simultaneously, it enables diverse applications through versatile task combinations. Project page: https://ali-vilab.github.io/VACE-Page/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.07598v1-abstract-full').style.display = 'none'; document.getElementById('2503.07598v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.07334">arXiv:2503.07334</a> <span> [<a href="https://arxiv.org/pdf/2503.07334">pdf</a>, <a href="https://arxiv.org/format/2503.07334">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Unleashing the Potential of Large Language Models for Text-to-Image Generation through Autoregressive Representation Alignment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xie%2C+X">Xing Xie</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jiawei Liu</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Z">Ziyue Lin</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+H">Huijie Fan</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zhi Han</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Y">Yandong Tang</a>, <a href="/search/cs?searchtype=author&query=Qu%2C+L">Liangqiong Qu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.07334v1-abstract-short" style="display: inline;"> We present Autoregressive Representation Alignment (ARRA), a new training framework that unlocks global-coherent text-to-image generation in autoregressive LLMs without architectural changes. Unlike prior work that requires complex architectural redesigns, ARRA aligns LLM hidden states with visual representations from external visual foundational models via a global visual alignment loss and a hyb… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.07334v1-abstract-full').style.display = 'inline'; document.getElementById('2503.07334v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.07334v1-abstract-full" style="display: none;"> We present Autoregressive Representation Alignment (ARRA), a new training framework that unlocks global-coherent text-to-image generation in autoregressive LLMs without architectural changes. Unlike prior work that requires complex architectural redesigns, ARRA aligns LLM hidden states with visual representations from external visual foundational models via a global visual alignment loss and a hybrid token, <HYBNEXT>. This token enforces dual constraints: local next-token prediction and global semantic distillation, enabling LLMs to implicitly learn spatial and contextual coherence while retaining their original autoregressive paradigm. Extensive experiments validate ARRA's plug-and-play versatility. When training from text-generation-only LLMs or random initialization, ARRA reduces FID by 25.5% (MIMIC-CXR), 8.8% (DeepEyeNet), and 7.5% (ImageNet) for advanced autoregressive LLMs like Chameleon and LlamaGen, all without framework modifications. For domain adaption, ARRA aligns general-purpose LLMs with specialized models (e.g., BioMedCLIP), achieving an 18.6% FID reduction over direct fine-tuning on medical imaging (MIMIC-CXR). By demonstrating that training objective redesign -- not just architectural innovation -- can resolve cross-modal global coherence challenges, ARRA offers a complementary paradigm for advancing autoregressive models. Code and models will be released to advance autoregressive image generation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.07334v1-abstract-full').style.display = 'none'; document.getElementById('2503.07334v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.04184">arXiv:2503.04184</a> <span> [<a href="https://arxiv.org/pdf/2503.04184">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Large-Scale AI in Telecom: Charting the Roadmap for Innovation, Scalability, and Enhanced Digital Experiences </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shahid%2C+A">Adnan Shahid</a>, <a href="/search/cs?searchtype=author&query=Kliks%2C+A">Adrian Kliks</a>, <a href="/search/cs?searchtype=author&query=Al-Tahmeesschi%2C+A">Ahmed Al-Tahmeesschi</a>, <a href="/search/cs?searchtype=author&query=Elbakary%2C+A">Ahmed Elbakary</a>, <a href="/search/cs?searchtype=author&query=Nikou%2C+A">Alexandros Nikou</a>, <a href="/search/cs?searchtype=author&query=Maatouk%2C+A">Ali Maatouk</a>, <a href="/search/cs?searchtype=author&query=Mokh%2C+A">Ali Mokh</a>, <a href="/search/cs?searchtype=author&query=Kazemi%2C+A">Amirreza Kazemi</a>, <a href="/search/cs?searchtype=author&query=De+Domenico%2C+A">Antonio De Domenico</a>, <a href="/search/cs?searchtype=author&query=Karapantelakis%2C+A">Athanasios Karapantelakis</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+B">Bo Cheng</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+B">Bo Yang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Bohao Wang</a>, <a href="/search/cs?searchtype=author&query=Fischione%2C+C">Carlo Fischione</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chao Zhang</a>, <a href="/search/cs?searchtype=author&query=Issaid%2C+C+B">Chaouki Ben Issaid</a>, <a href="/search/cs?searchtype=author&query=Yuen%2C+C">Chau Yuen</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+C">Chenghui Peng</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+C">Chongwen Huang</a>, <a href="/search/cs?searchtype=author&query=Chaccour%2C+C">Christina Chaccour</a>, <a href="/search/cs?searchtype=author&query=Thomas%2C+C+K">Christo Kurisummoottil Thomas</a>, <a href="/search/cs?searchtype=author&query=Sharma%2C+D">Dheeraj Sharma</a>, <a href="/search/cs?searchtype=author&query=Kalogiros%2C+D">Dimitris Kalogiros</a>, <a href="/search/cs?searchtype=author&query=Niyato%2C+D">Dusit Niyato</a>, <a href="/search/cs?searchtype=author&query=De+Poorter%2C+E">Eli De Poorter</a> , et al. (110 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.04184v1-abstract-short" style="display: inline;"> This white paper discusses the role of large-scale AI in the telecommunications industry, with a specific focus on the potential of generative AI to revolutionize network functions and user experiences, especially in the context of 6G systems. It highlights the development and deployment of Large Telecom Models (LTMs), which are tailored AI models designed to address the complex challenges faced b… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.04184v1-abstract-full').style.display = 'inline'; document.getElementById('2503.04184v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.04184v1-abstract-full" style="display: none;"> This white paper discusses the role of large-scale AI in the telecommunications industry, with a specific focus on the potential of generative AI to revolutionize network functions and user experiences, especially in the context of 6G systems. It highlights the development and deployment of Large Telecom Models (LTMs), which are tailored AI models designed to address the complex challenges faced by modern telecom networks. The paper covers a wide range of topics, from the architecture and deployment strategies of LTMs to their applications in network management, resource allocation, and optimization. It also explores the regulatory, ethical, and standardization considerations for LTMs, offering insights into their future integration into telecom infrastructure. The goal is to provide a comprehensive roadmap for the adoption of LTMs to enhance scalability, performance, and user-centric innovation in telecom networks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.04184v1-abstract-full').style.display = 'none'; document.getElementById('2503.04184v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.02412">arXiv:2503.02412</a> <span> [<a href="https://arxiv.org/pdf/2503.02412">pdf</a>, <a href="https://arxiv.org/format/2503.02412">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> SEB-Naver: A SE(2)-based Local Navigation Framework for Car-like Robots on Uneven Terrain </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiaoying Li</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+L">Long Xu</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+X">Xiaolin Huang</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+D">Donglai Xue</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhihao Zhang</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zhichao Han</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+C">Chao Xu</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+Y">Yanjun Cao</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+F">Fei Gao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.02412v2-abstract-short" style="display: inline;"> Autonomous navigation of car-like robots on uneven terrain poses unique challenges compared to flat terrain, particularly in traversability assessment and terrain-associated kinematic modelling for motion planning. This paper introduces SEB-Naver, a novel SE(2)-based local navigation framework designed to overcome these challenges. First, we propose an efficient traversability assessment method fo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.02412v2-abstract-full').style.display = 'inline'; document.getElementById('2503.02412v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.02412v2-abstract-full" style="display: none;"> Autonomous navigation of car-like robots on uneven terrain poses unique challenges compared to flat terrain, particularly in traversability assessment and terrain-associated kinematic modelling for motion planning. This paper introduces SEB-Naver, a novel SE(2)-based local navigation framework designed to overcome these challenges. First, we propose an efficient traversability assessment method for SE(2) grids, leveraging GPU parallel computing to enable real-time updates and maintenance of local maps. Second, inspired by differential flatness, we present an optimization-based trajectory planning method that integrates terrain-associated kinematic models, significantly improving both planning efficiency and trajectory quality. Finally, we unify these components into SEB-Naver, achieving real-time terrain assessment and trajectory optimization. Extensive simulations and real-world experiments demonstrate the effectiveness and efficiency of our approach. The code is at https://github.com/ZJU-FAST-Lab/seb_naver. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.02412v2-abstract-full').style.display = 'none'; document.getElementById('2503.02412v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 8 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.00785">arXiv:2503.00785</a> <span> [<a href="https://arxiv.org/pdf/2503.00785">pdf</a>, <a href="https://arxiv.org/format/2503.00785">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> FLOAT Drone: A Fully-actuated Coaxial Aerial Robot for Close-Proximity Operations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lin%2C+J">Junxiao Lin</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+S">Shuhang Ji</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yuze Wu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+T">Tianyue Wu</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zhichao Han</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+F">Fei Gao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.00785v1-abstract-short" style="display: inline;"> How to endow aerial robots with the ability to operate in close proximity remains an open problem. The core challenges lie in the propulsion system's dual-task requirement: generating manipulation forces while simultaneously counteracting gravity. These competing demands create dynamic coupling effects during physical interactions. Furthermore, rotor-induced airflow disturbances critically undermi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.00785v1-abstract-full').style.display = 'inline'; document.getElementById('2503.00785v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.00785v1-abstract-full" style="display: none;"> How to endow aerial robots with the ability to operate in close proximity remains an open problem. The core challenges lie in the propulsion system's dual-task requirement: generating manipulation forces while simultaneously counteracting gravity. These competing demands create dynamic coupling effects during physical interactions. Furthermore, rotor-induced airflow disturbances critically undermine operational reliability. Although fully-actuated unmanned aerial vehicles (UAVs) alleviate dynamic coupling effects via six-degree-of-freedom (6-DoF) force-torque decoupling, existing implementations fail to address the aerodynamic interference between drones and environments. They also suffer from oversized designs, which compromise maneuverability and limit their applications in various operational scenarios. To address these limitations, we present FLOAT Drone (FuLly-actuated cOaxial Aerial roboT), a novel fully-actuated UAV featuring two key structural innovations. By integrating control surfaces into fully-actuated systems for the first time, we significantly suppress lateral airflow disturbances during operations. Furthermore, a coaxial dual-rotor configuration enables a compact size while maintaining high hovering efficiency. Through dynamic modeling, we have developed hierarchical position and attitude controllers that support both fully-actuated and underactuated modes. Experimental validation through comprehensive real-world experiments confirms the system's functional capabilities in close-proximity operations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.00785v1-abstract-full').style.display = 'none'; document.getElementById('2503.00785v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 9 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.00476">arXiv:2503.00476</a> <span> [<a href="https://arxiv.org/pdf/2503.00476">pdf</a>, <a href="https://arxiv.org/format/2503.00476">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> G-OSR: A Comprehensive Benchmark for Graph Open-Set Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Dong%2C+Y">Yicong Dong</a>, <a href="/search/cs?searchtype=author&query=He%2C+R">Rundong He</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+G">Guangyao Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wentao Zhang</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zhongyi Han</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+J">Jieming Shi</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+Y">Yilong Yin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.00476v1-abstract-short" style="display: inline;"> Graph Neural Networks (GNNs) have achieved significant success in machine learning, with wide applications in social networks, bioinformatics, knowledge graphs, and other fields. Most research assumes ideal closed-set environments. However, in real-world open-set environments, graph learning models face challenges in robustness and reliability due to unseen classes. This highlights the need for Gr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.00476v1-abstract-full').style.display = 'inline'; document.getElementById('2503.00476v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.00476v1-abstract-full" style="display: none;"> Graph Neural Networks (GNNs) have achieved significant success in machine learning, with wide applications in social networks, bioinformatics, knowledge graphs, and other fields. Most research assumes ideal closed-set environments. However, in real-world open-set environments, graph learning models face challenges in robustness and reliability due to unseen classes. This highlights the need for Graph Open-Set Recognition (GOSR) methods to address these issues and ensure effective GNN application in practical scenarios. Research in GOSR is in its early stages, with a lack of a comprehensive benchmark spanning diverse tasks and datasets to evaluate methods. Moreover, traditional methods, Graph Out-of-Distribution Detection (GOODD), GOSR, and Graph Anomaly Detection (GAD) have mostly evolved in isolation, with little exploration of their interconnections or potential applications to GOSR. To fill these gaps, we introduce \textbf{G-OSR}, a comprehensive benchmark for evaluating GOSR methods at both the node and graph levels, using datasets from multiple domains to ensure fair and standardized comparisons of effectiveness and efficiency across traditional, GOODD, GOSR, and GAD methods. The results offer critical insights into the generalizability and limitations of current GOSR methods and provide valuable resources for advancing research in this field through systematic analysis of diverse approaches. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.00476v1-abstract-full').style.display = 'none'; document.getElementById('2503.00476v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages,2 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.20687">arXiv:2502.20687</a> <span> [<a href="https://arxiv.org/pdf/2502.20687">pdf</a>, <a href="https://arxiv.org/format/2502.20687">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Unleashing the Potential of Two-Tower Models: Diffusion-Based Cross-Interaction for Large-Scale Matching </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yihan Wang</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+F">Fei Xiong</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zhexin Han</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Q">Qi Song</a>, <a href="/search/cs?searchtype=author&query=Zhan%2C+K">Kaiqiao Zhan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Ben Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.20687v1-abstract-short" style="display: inline;"> Two-tower models are widely adopted in the industrial-scale matching stage across a broad range of application domains, such as content recommendations, advertisement systems, and search engines. This model efficiently handles large-scale candidate item screening by separating user and item representations. However, the decoupling network also leads to a neglect of potential information interactio… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.20687v1-abstract-full').style.display = 'inline'; document.getElementById('2502.20687v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.20687v1-abstract-full" style="display: none;"> Two-tower models are widely adopted in the industrial-scale matching stage across a broad range of application domains, such as content recommendations, advertisement systems, and search engines. This model efficiently handles large-scale candidate item screening by separating user and item representations. However, the decoupling network also leads to a neglect of potential information interaction between the user and item representations. Current state-of-the-art (SOTA) approaches include adding a shallow fully connected layer(i.e., COLD), which is limited by performance and can only be used in the ranking stage. For performance considerations, another approach attempts to capture historical positive interaction information from the other tower by regarding them as the input features(i.e., DAT). Later research showed that the gains achieved by this method are still limited because of lacking the guidance on the next user intent. To address the aforementioned challenges, we propose a "cross-interaction decoupling architecture" within our matching paradigm. This user-tower architecture leverages a diffusion module to reconstruct the next positive intention representation and employs a mixed-attention module to facilitate comprehensive cross-interaction. During the next positive intention generation, we further enhance the accuracy of its reconstruction by explicitly extracting the temporal drift within user behavior sequences. Experiments on two real-world datasets and one industrial dataset demonstrate that our method outperforms the SOTA two-tower models significantly, and our diffusion approach outperforms other generative models in reconstructing item representations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.20687v1-abstract-full').style.display = 'none'; document.getElementById('2502.20687v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.19832">arXiv:2502.19832</a> <span> [<a href="https://arxiv.org/pdf/2502.19832">pdf</a>, <a href="https://arxiv.org/format/2502.19832">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Tracailer: An Efficient Trajectory Planner for Tractor-Trailer Vehicles in Unstructured Environments </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+L">Long Xu</a>, <a href="/search/cs?searchtype=author&query=Chai%2C+K">Kaixin Chai</a>, <a href="/search/cs?searchtype=author&query=An%2C+B">Boyuan An</a>, <a href="/search/cs?searchtype=author&query=Gan%2C+J">Jiaxiang Gan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qianhao Wang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yuan Zhou</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiaoying Li</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+J">Junxiao Lin</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zhichao Han</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+C">Chao Xu</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+Y">Yanjun Cao</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+F">Fei Gao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.19832v1-abstract-short" style="display: inline;"> The tractor-trailer vehicle (robot) consists of a drivable tractor and one or more non-drivable trailers connected via hitches. Compared to typical car-like robots, the addition of trailers provides greater transportation capability. However, this also complicates motion planning due to the robot's complex kinematics, high-dimensional state space, and deformable structure. To efficiently plan safe… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.19832v1-abstract-full').style.display = 'inline'; document.getElementById('2502.19832v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.19832v1-abstract-full" style="display: none;"> The tractor-trailer vehicle (robot) consists of a drivable tractor and one or more non-drivable trailers connected via hitches. Compared to typical car-like robots, the addition of trailers provides greater transportation capability. However, this also complicates motion planning due to the robot's complex kinematics, high-dimensional state space, and deformable structure. To efficiently plan safe, time-optimal trajectories that adhere to the kinematic constraints of the robot and address the challenges posed by its unique features, this paper introduces a lightweight, compact, and high-order smooth trajectory representation for tractor-trailer robots. Based on it, we design an efficiently solvable spatio-temporal trajectory optimization problem. To deal with deformable structures, which leads to difficulties in collision avoidance, we fully leverage the collision-free regions of the environment, directly applying deformations to trajectories in continuous space. This approach not requires constructing safe regions from the environment using convex approximations through collision-free seed points before each optimization, avoiding the loss of the solution space, thus reducing the dependency of the optimization on initial values. Moreover, a multi-terminal fast path search algorithm is proposed to generate the initial values for optimization. Extensive simulation experiments demonstrate that our approach achieves several-fold improvements in efficiency compared to existing algorithms, while also ensuring lower curvature and trajectory duration. Real-world experiments involving the transportation, loading and unloading of goods in both indoor and outdoor scenarios further validate the effectiveness of our method. The source code is accessible at https://github.com/ZJU-FAST-Lab/tracailer/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.19832v1-abstract-full').style.display = 'none'; document.getElementById('2502.19832v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">15 pages, 12 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.19200">arXiv:2502.19200</a> <span> [<a href="https://arxiv.org/pdf/2502.19200">pdf</a>, <a href="https://arxiv.org/format/2502.19200">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> HDM: Hybrid Diffusion Model for Unified Image Anomaly Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Weng%2C+Z">Zekang Weng</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+J">Jinjin Shi</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jinwei Wang</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zeming Han</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.19200v1-abstract-short" style="display: inline;"> Image anomaly detection plays a vital role in applications such as industrial quality inspection and medical imaging, where it directly contributes to improving product quality and system reliability. However, existing methods often struggle with complex and diverse anomaly patterns. In particular, the separation between generation and discrimination tasks limits the effective coordination between… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.19200v1-abstract-full').style.display = 'inline'; document.getElementById('2502.19200v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.19200v1-abstract-full" style="display: none;"> Image anomaly detection plays a vital role in applications such as industrial quality inspection and medical imaging, where it directly contributes to improving product quality and system reliability. However, existing methods often struggle with complex and diverse anomaly patterns. In particular, the separation between generation and discrimination tasks limits the effective coordination between anomaly sample generation and anomaly region detection. To address these challenges, we propose a novel hybrid diffusion model (HDM) that integrates generation and discrimination into a unified framework. The model consists of three key modules: the Diffusion Anomaly Generation Module (DAGM), the Diffusion Discriminative Module (DDM), and the Probability Optimization Module (POM). DAGM generates realistic and diverse anomaly samples, improving their representativeness. DDM then applies a reverse diffusion process to capture the differences between generated and normal samples, enabling precise anomaly region detection and localization based on probability distributions. POM refines the probability distributions during both the generation and discrimination phases, ensuring high-quality samples are used for training. Extensive experiments on multiple industrial image datasets demonstrate that our method outperforms state-of-the-art approaches, significantly improving both image-level and pixel-level anomaly detection performance, as measured by AUROC. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.19200v1-abstract-full').style.display = 'none'; document.getElementById('2502.19200v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.17829">arXiv:2502.17829</a> <span> [<a href="https://arxiv.org/pdf/2502.17829">pdf</a>, <a href="https://arxiv.org/format/2502.17829">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Silent Speech Sentence Recognition with Six-Axis Accelerometers using Conformer and CTC Algorithm </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xie%2C+Y">Yudong Xie</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zhifeng Han</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+Q">Qinfan Xiao</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+L">Liwei Liang</a>, <a href="/search/cs?searchtype=author&query=Tao%2C+L">Lu-Qi Tao</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+T">Tian-Ling Ren</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.17829v1-abstract-short" style="display: inline;"> Silent speech interfaces (SSI) are being actively developed to assist individuals with communication impairments who have long suffered from daily hardships and a reduced quality of life. However, silent sentences are difficult to segment and recognize due to elision and linking. A novel silent speech sentence recognition method is proposed to convert the facial motion signals collected by six-axi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.17829v1-abstract-full').style.display = 'inline'; document.getElementById('2502.17829v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.17829v1-abstract-full" style="display: none;"> Silent speech interfaces (SSI) are being actively developed to assist individuals with communication impairments who have long suffered from daily hardships and a reduced quality of life. However, silent sentences are difficult to segment and recognize due to elision and linking. A novel silent speech sentence recognition method is proposed to convert the facial motion signals collected by six-axis accelerometers into transcribed words and sentences. A Conformer-based neural network with the Connectionist-Temporal-Classification algorithm is used to gain contextual understanding and translate the non-acoustic signals into words sequences, solely requesting the constituent words in the database. Test results show that the proposed method achieves a 97.17% accuracy in sentence recognition, surpassing the existing silent speech recognition methods with a typical accuracy of 85%-95%, and demonstrating the potential of accelerometers as an available SSI modality for high-accuracy silent speech sentence recognition. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.17829v1-abstract-full').style.display = 'none'; document.getElementById('2502.17829v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.16866">arXiv:2502.16866</a> <span> [<a href="https://arxiv.org/pdf/2502.16866">pdf</a>, <a href="https://arxiv.org/format/2502.16866">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Toward Agentic AI: Generative Information Retrieval Inspired Intelligent Communications and Networking </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Ruichen Zhang</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+S">Shunpu Tang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yinqiu Liu</a>, <a href="/search/cs?searchtype=author&query=Niyato%2C+D">Dusit Niyato</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+Z">Zehui Xiong</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+S">Sumei Sun</a>, <a href="/search/cs?searchtype=author&query=Mao%2C+S">Shiwen Mao</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zhu Han</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.16866v1-abstract-short" style="display: inline;"> The increasing complexity and scale of modern telecommunications networks demand intelligent automation to enhance efficiency, adaptability, and resilience. Agentic AI has emerged as a key paradigm for intelligent communications and networking, enabling AI-driven agents to perceive, reason, decide, and act within dynamic networking environments. However, effective decision-making in telecom applic… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.16866v1-abstract-full').style.display = 'inline'; document.getElementById('2502.16866v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.16866v1-abstract-full" style="display: none;"> The increasing complexity and scale of modern telecommunications networks demand intelligent automation to enhance efficiency, adaptability, and resilience. Agentic AI has emerged as a key paradigm for intelligent communications and networking, enabling AI-driven agents to perceive, reason, decide, and act within dynamic networking environments. However, effective decision-making in telecom applications, such as network planning, management, and resource allocation, requires integrating retrieval mechanisms that support multi-hop reasoning, historical cross-referencing, and compliance with evolving 3GPP standards. This article presents a forward-looking perspective on generative information retrieval-inspired intelligent communications and networking, emphasizing the role of knowledge acquisition, processing, and retrieval in agentic AI for telecom systems. We first provide a comprehensive review of generative information retrieval strategies, including traditional retrieval, hybrid retrieval, semantic retrieval, knowledge-based retrieval, and agentic contextual retrieval. We then analyze their advantages, limitations, and suitability for various networking scenarios. Next, we present a survey about their applications in communications and networking. Additionally, we introduce an agentic contextual retrieval framework to enhance telecom-specific planning by integrating multi-source retrieval, structured reasoning, and self-reflective validation. Experimental results demonstrate that our framework significantly improves answer accuracy, explanation consistency, and retrieval efficiency compared to traditional and semantic retrieval methods. Finally, we outline future research directions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.16866v1-abstract-full').style.display = 'none'; document.getElementById('2502.16866v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7 pages, 4 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.16468">arXiv:2502.16468</a> <span> [<a href="https://arxiv.org/pdf/2502.16468">pdf</a>, <a href="https://arxiv.org/format/2502.16468">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> </div> </div> <p class="title is-5 mathjax"> A Contemporary Survey on Semantic Communications:Theory of Mind, Generative AI, and Deep Joint Source-Channel Coding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Nguyen%2C+L+X">Loc X. Nguyen</a>, <a href="/search/cs?searchtype=author&query=Raha%2C+A+D">Avi Deb Raha</a>, <a href="/search/cs?searchtype=author&query=Aung%2C+P+S">Pyae Sone Aung</a>, <a href="/search/cs?searchtype=author&query=Niyato%2C+D">Dusit Niyato</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zhu Han</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+C+S">Choong Seon Hong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.16468v1-abstract-short" style="display: inline;"> Semantic Communication is becoming the next pillar in wireless communication technology due to its various capabilities. However, it still encounters various challenging obstacles that need to be solved before real-world deployment. The major challenge is the lack of standardization across different directions, leading to variations in interpretations and objectives. In the survey, we provide deta… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.16468v1-abstract-full').style.display = 'inline'; document.getElementById('2502.16468v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.16468v1-abstract-full" style="display: none;"> Semantic Communication is becoming the next pillar in wireless communication technology due to its various capabilities. However, it still encounters various challenging obstacles that need to be solved before real-world deployment. The major challenge is the lack of standardization across different directions, leading to variations in interpretations and objectives. In the survey, we provide detailed explanations of three leading directions in semantic communications, namely Theory of Mind, Generative AI, Deep Joint Source-Channel Coding. These directions have been widely studied, developed, and verified by institutes worldwide, and their effectiveness has increased along with the advancement in technology. We first introduce the concepts and background of these directions. Firstly, we introduce the Theory of Mind, where the communication agents interact with each other, gaining understanding from observations and slowly forming a common language. Secondly, we present generative AI models, which can create new content and offer more freedom to interpret the data beyond the limitation of semantic meaning compression of raw data before transmitting it. The received signal is then decoded by another generative AI model to execute the oriented task. Thirdly, we review deep learning models to jointly optimize the source and channel coding modules. Then, we present a comprehensive survey of existing works in each direction, thereby offering readers an overview of past achievements and potential avenues for further contribution. Moreover, for each direction, we identify and discuss the existing challenges that must be addressed before these approaches can be effectively deployed in real-world scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.16468v1-abstract-full').style.display = 'none'; document.getElementById('2502.16468v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">34 pages, 9 figures, 7 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.16363">arXiv:2502.16363</a> <span> [<a href="https://arxiv.org/pdf/2502.16363">pdf</a>, <a href="https://arxiv.org/format/2502.16363">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Science and Game Theory">cs.GT</span> </div> </div> <p class="title is-5 mathjax"> Multi-Party Data Pricing for Complex Data Trading Markets: A Rubinstein Bargaining Approach </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mi%2C+B">Bing Mi</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zhengwang Han</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+K">Kongyang Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.16363v1-abstract-short" style="display: inline;"> With the rapid development of Internet of Things (IoT) and artificial intelligence technologies, data has become an important strategic resource in the new era. However, the growing demand for data has exacerbated the issue of \textit{data silos}. Existing data pricing models primarily focus on single factors such as data quality or market demand, failing to adequately address issues such as data… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.16363v1-abstract-full').style.display = 'inline'; document.getElementById('2502.16363v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.16363v1-abstract-full" style="display: none;"> With the rapid development of Internet of Things (IoT) and artificial intelligence technologies, data has become an important strategic resource in the new era. However, the growing demand for data has exacerbated the issue of \textit{data silos}. Existing data pricing models primarily focus on single factors such as data quality or market demand, failing to adequately address issues such as data seller monopolies and the diverse needs of buyers, resulting in biased pricing that cannot meet the complexities of evolving transaction scenarios. To address these problems, this paper proposes a multi-party data pricing model based on the Rubinstein bargaining model. The model introduces buyer data utility indicators and data quality assessments, comprehensively considering factors such as the utility, accuracy, and timeliness of data sets, to more accurately evaluate their value to buyers. To overcome the limitations of single-factor models, this paper innovatively introduces the buyer data set satisfaction indicator, which reflects the overall satisfaction of buyers with data sets by integrating both data utility and quality assessments. Based on this, the model uses the Rubinstein bargaining model to simulate the pricing process between multiple sellers and multiple buyers, yielding pricing results that better align with market demands. Experimental results show that the proposed model effectively addresses the pricing imbalance caused by data monopolies and demonstrates good applicability and accuracy in multi-seller, multi-buyer transaction environments. This research provides an effective pricing mechanism for complex data trading markets and has significant theoretical and practical value in solving pricing issues in actual data transactions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.16363v1-abstract-full').style.display = 'none'; document.getElementById('2502.16363v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.15859">arXiv:2502.15859</a> <span> [<a href="https://arxiv.org/pdf/2502.15859">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> AI Governance InternationaL Evaluation Index (AGILE Index) </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zeng%2C+Y">Yi Zeng</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+E">Enmeng Lu</a>, <a href="/search/cs?searchtype=author&query=Guan%2C+X">Xin Guan</a>, <a href="/search/cs?searchtype=author&query=Huangfu%2C+C">Cunqing Huangfu</a>, <a href="/search/cs?searchtype=author&query=Ruan%2C+Z">Zizhe Ruan</a>, <a href="/search/cs?searchtype=author&query=Younas%2C+A">Ammar Younas</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+K">Kang Sun</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+X">Xuan Tang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yuwei Wang</a>, <a href="/search/cs?searchtype=author&query=Suo%2C+H">Hongjie Suo</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+D">Dongqi Liang</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zhengqiang Han</a>, <a href="/search/cs?searchtype=author&query=Bao%2C+A">Aorigele Bao</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+X">Xiaoyang Guo</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jin Wang</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+J">Jiawei Xie</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+Y">Yao Liang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.15859v3-abstract-short" style="display: inline;"> The rapid advancement of Artificial Intelligence (AI) technology is profoundly transforming human society and concurrently presenting a series of ethical, legal, and social issues. The effective governance of AI has become a crucial global concern. Since 2022, the extensive deployment of generative AI, particularly large language models, marked a new phase in AI governance. Continuous efforts are… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.15859v3-abstract-full').style.display = 'inline'; document.getElementById('2502.15859v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.15859v3-abstract-full" style="display: none;"> The rapid advancement of Artificial Intelligence (AI) technology is profoundly transforming human society and concurrently presenting a series of ethical, legal, and social issues. The effective governance of AI has become a crucial global concern. Since 2022, the extensive deployment of generative AI, particularly large language models, marked a new phase in AI governance. Continuous efforts are being made by the international community in actively addressing the novel challenges posed by these AI developments. As consensus on international governance continues to be established and put into action, the practical importance of conducting a global assessment of the state of AI governance is progressively coming to light. In this context, we initiated the development of the AI Governance InternationaL Evaluation Index (AGILE Index). Adhering to the design principle, "the level of governance should match the level of development," the inaugural evaluation of the AGILE Index commences with an exploration of four foundational pillars: the development level of AI, the AI governance environment, the AI governance instruments, and the AI governance effectiveness. It covers 39 indicators across 18 dimensions to comprehensively assess the AI governance level of 14 representative countries globally. The index is utilized to delve into the status of AI governance to date in 14 countries for the first batch of evaluation. The aim is to depict the current state of AI governance in these countries through data scoring, assist them in identifying their governance stage and uncovering governance issues, and ultimately offer insights for the enhancement of their AI governance systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.15859v3-abstract-full').style.display = 'none'; document.getElementById('2502.15859v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Evaluation Report. 85 pages, 30 Figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68T01 <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> A.1 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.14694">arXiv:2502.14694</a> <span> [<a href="https://arxiv.org/pdf/2502.14694">pdf</a>, <a href="https://arxiv.org/format/2502.14694">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> </div> </div> <p class="title is-5 mathjax"> Revisiting Near-Far Field Boundary in Dual-Polarized XL-MIMO Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zeng%2C+S">Shuhao Zeng</a>, <a href="/search/cs?searchtype=author&query=Di%2C+B">Boya Di</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Hongliang Zhang</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zhu Han</a>, <a href="/search/cs?searchtype=author&query=Poor%2C+H+V">H. Vincent Poor</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.14694v1-abstract-short" style="display: inline;"> Extremely large-scale multiple-input multiple-output (XL-MIMO) is expected to be an important technology in future sixth generation (6G) networks. Compared with conventional single-polarized XL-MIMO, where signals are transmitted and received in only one polarization direction, dual-polarized XL-MIMO systems achieve higher data rate by improving multiplexing performances, and thus are the focus of… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.14694v1-abstract-full').style.display = 'inline'; document.getElementById('2502.14694v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.14694v1-abstract-full" style="display: none;"> Extremely large-scale multiple-input multiple-output (XL-MIMO) is expected to be an important technology in future sixth generation (6G) networks. Compared with conventional single-polarized XL-MIMO, where signals are transmitted and received in only one polarization direction, dual-polarized XL-MIMO systems achieve higher data rate by improving multiplexing performances, and thus are the focus of this paper. Due to enlarged aperture, near-field regions become non-negligible in XL-MIMO communications, necessitating accurate near-far field boundary characterizations. However, existing boundaries developed for single-polarized systems only consider phase or power differences across array elements while irrespective of cross-polarization discrimination (XPD) variances in dual-polarized XL-MIMO systems, deteriorating transmit covariance optimization performances. In this paper, we revisit near-far field boundaries for dual-polarized XL-MIMO systems by taking XPD differences into account, which faces the following challenge. Unlike existing near-far field boundaries, which only need to consider co-polarized channel components, deriving boundaries for dual-polarized XL-MIMO systems requires modeling joint effects of co-polarized and cross-polarized components. To address this issue, we model XPD variations across antennas and introduce a non-uniform XPD distance to complement existing near-far field boundaries. Based on the new distance criterion, we propose an efficient scheme to optimize transmit covariance. Numerical results validate our analysis and demonstrate the proposed algorithm's effectiveness. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.14694v1-abstract-full').style.display = 'none'; document.getElementById('2502.14694v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">16 pages, 8 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.12379">arXiv:2502.12379</a> <span> [<a href="https://arxiv.org/pdf/2502.12379">pdf</a>, <a href="https://arxiv.org/format/2502.12379">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> OCT Data is All You Need: How Vision Transformers with and without Pre-training Benefit Imaging </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zihao Han</a>, <a href="/search/cs?searchtype=author&query=De+Wilde%2C+P">Philippe De Wilde</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.12379v1-abstract-short" style="display: inline;"> Optical Coherence Tomography (OCT) provides high-resolution cross-sectional images useful for diagnosing various diseases, but their distinct characteristics from natural images raise questions about whether large-scale pre-training on datasets like ImageNet is always beneficial. In this paper, we investigate the impact of ImageNet-based pre-training on Vision Transformer (ViT) performance for OCT… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12379v1-abstract-full').style.display = 'inline'; document.getElementById('2502.12379v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.12379v1-abstract-full" style="display: none;"> Optical Coherence Tomography (OCT) provides high-resolution cross-sectional images useful for diagnosing various diseases, but their distinct characteristics from natural images raise questions about whether large-scale pre-training on datasets like ImageNet is always beneficial. In this paper, we investigate the impact of ImageNet-based pre-training on Vision Transformer (ViT) performance for OCT image classification across different dataset sizes. Our experiments cover four-category retinal pathologies (CNV, DME, Drusen, Normal). Results suggest that while pre-training can accelerate convergence and potentially offer better performance in smaller datasets, training from scratch may achieve comparable or even superior accuracy when sufficient OCT data is available. Our findings highlight the importance of matching domain characteristics in pre-training and call for further study on large-scale OCT-specific pre-training. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12379v1-abstract-full').style.display = 'none'; document.getElementById('2502.12379v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.10881">arXiv:2502.10881</a> <span> [<a href="https://arxiv.org/pdf/2502.10881">pdf</a>, <a href="https://arxiv.org/format/2502.10881">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> CiteCheck: Towards Accurate Citation Faithfulness Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Ziyao Xu</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+S">Shaohang Wei</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zhuoheng Han</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+J">Jing Jin</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zhe Yang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiaoguang Li</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+H">Haochen Tan</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Z">Zhijiang Guo</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Houfeng Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.10881v1-abstract-short" style="display: inline;"> Citation faithfulness detection is critical for enhancing retrieval-augmented generation (RAG) systems, yet large-scale Chinese datasets for this task are scarce. Existing methods face prohibitive costs due to the need for manually annotated negative samples. To address this, we introduce the first large-scale Chinese dataset CiteCheck for citation faithfulness detection, constructed via a cost-ef… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.10881v1-abstract-full').style.display = 'inline'; document.getElementById('2502.10881v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.10881v1-abstract-full" style="display: none;"> Citation faithfulness detection is critical for enhancing retrieval-augmented generation (RAG) systems, yet large-scale Chinese datasets for this task are scarce. Existing methods face prohibitive costs due to the need for manually annotated negative samples. To address this, we introduce the first large-scale Chinese dataset CiteCheck for citation faithfulness detection, constructed via a cost-effective approach using two-stage manual annotation. This method balances positive and negative samples while significantly reducing annotation expenses. CiteCheck comprises training and test splits. Experiments demonstrate that: (1) the test samples are highly challenging, with even state-of-the-art LLMs failing to achieve high accuracy; and (2) training data augmented with LLM-generated negative samples enables smaller models to attain strong performance using parameter-efficient fine-tuning. CiteCheck provides a robust foundation for advancing citation faithfulness detection in Chinese RAG systems. The dataset is publicly available to facilitate research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.10881v1-abstract-full').style.display = 'none'; document.getElementById('2502.10881v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.10731">arXiv:2502.10731</a> <span> [<a href="https://arxiv.org/pdf/2502.10731">pdf</a>, <a href="https://arxiv.org/ps/2502.10731">ps</a>, <a href="https://arxiv.org/format/2502.10731">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> </div> </div> <p class="title is-5 mathjax"> Service Function Chain Dynamic Scheduling in Space-Air-Ground Integrated Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jia%2C+Z">Ziye Jia</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+Y">Yilu Cao</a>, <a href="/search/cs?searchtype=author&query=He%2C+L">Lijun He</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Q">Qihui Wu</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Q">Qiuming Zhu</a>, <a href="/search/cs?searchtype=author&query=Niyato%2C+D">Dusit Niyato</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zhu Han</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.10731v2-abstract-short" style="display: inline;"> As an important component of the sixth generation communication technologies, the space-air-ground integrated network (SAGIN) attracts increasing attentions in recent years. However, due to the mobility and heterogeneity of the components such as satellites and unmanned aerial vehicles in multi-layer SAGIN, the challenges of inefficient resource allocation and management complexity are aggregated.… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.10731v2-abstract-full').style.display = 'inline'; document.getElementById('2502.10731v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.10731v2-abstract-full" style="display: none;"> As an important component of the sixth generation communication technologies, the space-air-ground integrated network (SAGIN) attracts increasing attentions in recent years. However, due to the mobility and heterogeneity of the components such as satellites and unmanned aerial vehicles in multi-layer SAGIN, the challenges of inefficient resource allocation and management complexity are aggregated. To this end, the network function virtualization technology is introduced and can be implemented via service function chains (SFCs) deployment. However, urgent unexpected tasks may bring conflicts and resource competition during SFC deployment, and how to schedule the SFCs of multiple tasks in SAGIN is a key issue. In this paper, we address the dynamic and complexity of SAGIN by presenting a reconfigurable time extension graph and further propose the dynamic SFC scheduling model. Then, we formulate the SFC scheduling problem to maximize the number of successful deployed SFCs within limited resources and time horizons. Since the problem is in the form of integer linear programming and intractable to solve, we propose the algorithm by incorporating deep reinforcement learning. Finally, simulation results show that the proposed algorithm has better convergence and performance compared to other benchmark algorithms. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.10731v2-abstract-full').style.display = 'none'; document.getElementById('2502.10731v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.09471">arXiv:2502.09471</a> <span> [<a href="https://arxiv.org/pdf/2502.09471">pdf</a>, <a href="https://arxiv.org/format/2502.09471">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Wholly-WOOD: Wholly Leveraging Diversified-quality Labels for Weakly-supervised Oriented Object Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yu%2C+Y">Yi Yu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xue Yang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yansheng Li</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zhenjun Han</a>, <a href="/search/cs?searchtype=author&query=Da%2C+F">Feipeng Da</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+J">Junchi Yan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.09471v1-abstract-short" style="display: inline;"> Accurately estimating the orientation of visual objects with compact rotated bounding boxes (RBoxes) has become a prominent demand, which challenges existing object detection paradigms that only use horizontal bounding boxes (HBoxes). To equip the detectors with orientation awareness, supervised regression/classification modules have been introduced at the high cost of rotation annotation. Meanwhi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09471v1-abstract-full').style.display = 'inline'; document.getElementById('2502.09471v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.09471v1-abstract-full" style="display: none;"> Accurately estimating the orientation of visual objects with compact rotated bounding boxes (RBoxes) has become a prominent demand, which challenges existing object detection paradigms that only use horizontal bounding boxes (HBoxes). To equip the detectors with orientation awareness, supervised regression/classification modules have been introduced at the high cost of rotation annotation. Meanwhile, some existing datasets with oriented objects are already annotated with horizontal boxes or even single points. It becomes attractive yet remains open for effectively utilizing weaker single point and horizontal annotations to train an oriented object detector (OOD). We develop Wholly-WOOD, a weakly-supervised OOD framework, capable of wholly leveraging various labeling forms (Points, HBoxes, RBoxes, and their combination) in a unified fashion. By only using HBox for training, our Wholly-WOOD achieves performance very close to that of the RBox-trained counterpart on remote sensing and other areas, significantly reducing the tedious efforts on labor-intensive annotation for oriented objects. The source codes are available at https://github.com/VisionXLab/whollywood (PyTorch-based) and https://github.com/VisionXLab/whollywood-jittor (Jittor-based). <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09471v1-abstract-full').style.display = 'none'; document.getElementById('2502.09471v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">18 pages, 9 figures, 9 tables, accepted by TPAMI</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.08119">arXiv:2502.08119</a> <span> [<a href="https://arxiv.org/pdf/2502.08119">pdf</a>, <a href="https://arxiv.org/format/2502.08119">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Generative AI-Enhanced Cooperative MEC of UAVs and Ground Stations for Unmanned Surface Vehicles </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=You%2C+J">Jiahao You</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+Z">Ziye Jia</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+C">Chao Dong</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Q">Qihui Wu</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zhu Han</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.08119v1-abstract-short" style="display: inline;"> The increasing deployment of unmanned surface vehicles (USVs) require computational support and coverage in applications such as maritime search and rescue. Unmanned aerial vehicles (UAVs) can offer low-cost, flexible aerial services, and ground stations (GSs) can provide powerful supports, which can cooperate to help the USVs in complex scenarios. However, the collaboration between UAVs and GSs f… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08119v1-abstract-full').style.display = 'inline'; document.getElementById('2502.08119v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.08119v1-abstract-full" style="display: none;"> The increasing deployment of unmanned surface vehicles (USVs) require computational support and coverage in applications such as maritime search and rescue. Unmanned aerial vehicles (UAVs) can offer low-cost, flexible aerial services, and ground stations (GSs) can provide powerful supports, which can cooperate to help the USVs in complex scenarios. However, the collaboration between UAVs and GSs for USVs faces challenges of task uncertainties, USVs trajectory uncertainties, heterogeneities, and limited computational resources. To address these issues, we propose a cooperative UAV and GS based robust multi-access edge computing framework to assist USVs in completing computational tasks. Specifically, we formulate the optimization problem of joint task offloading and UAV trajectory to minimize the total execution time, which is in the form of mixed integer nonlinear programming and NP-hard to tackle. Therefore, we propose the algorithm of generative artificial intelligence-enhanced heterogeneous agent proximal policy optimization (GAI-HAPPO). The proposed algorithm integrates GAI models to enhance the actor network ability to model complex environments and extract high-level features, thereby allowing the algorithm to predict uncertainties and adapt to dynamic conditions. Additionally, GAI stabilizes the critic network, addressing the instability of multi-agent reinforcement learning approaches. Finally, extensive simulations demonstrate that the proposed algorithm outperforms the existing benchmark methods, thus highlighting the potentials in tackling intricate, cross-domain issues in the considered scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08119v1-abstract-full').style.display = 'none'; document.getElementById('2502.08119v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05264">arXiv:2502.05264</a> <span> [<a href="https://arxiv.org/pdf/2502.05264">pdf</a>, <a href="https://arxiv.org/format/2502.05264">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Quantum Physics">quant-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Quantum automated learning with provable and explainable trainability </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ye%2C+Q">Qi Ye</a>, <a href="/search/cs?searchtype=author&query=Geng%2C+S">Shuangyue Geng</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zizhao Han</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Weikang Li</a>, <a href="/search/cs?searchtype=author&query=Duan%2C+L+-">L. -M. Duan</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+D">Dong-Ling Deng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.05264v1-abstract-short" style="display: inline;"> Machine learning is widely believed to be one of the most promising practical applications of quantum computing. Existing quantum machine learning schemes typically employ a quantum-classical hybrid approach that relies crucially on gradients of model parameters. Such an approach lacks provable convergence to global minima and will become infeasible as quantum learning models scale up. Here, we in… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05264v1-abstract-full').style.display = 'inline'; document.getElementById('2502.05264v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.05264v1-abstract-full" style="display: none;"> Machine learning is widely believed to be one of the most promising practical applications of quantum computing. Existing quantum machine learning schemes typically employ a quantum-classical hybrid approach that relies crucially on gradients of model parameters. Such an approach lacks provable convergence to global minima and will become infeasible as quantum learning models scale up. Here, we introduce quantum automated learning, where no variational parameter is involved and the training process is converted to quantum state preparation. In particular, we encode training data into unitary operations and iteratively evolve a random initial state under these unitaries and their inverses, with a target-oriented perturbation towards higher prediction accuracy sandwiched in between. Under reasonable assumptions, we rigorously prove that the evolution converges exponentially to the desired state corresponding to the global minimum of the loss function. We show that such a training process can be understood from the perspective of preparing quantum states by imaginary time evolution, where the data-encoded unitaries together with target-oriented perturbations would train the quantum learning model in an automated fashion. We further prove that the quantum automated learning paradigm features good generalization ability with the generalization error upper bounded by the ratio between a logarithmic function of the Hilbert space dimension and the number of training samples. In addition, we carry out extensive numerical simulations on real-life images and quantum data to demonstrate the effectiveness of our approach and validate the assumptions. Our results establish an unconventional quantum learning strategy that is gradient-free with provable and explainable trainability, which would be crucial for large-scale practical applications of quantum computing in machine learning scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05264v1-abstract-full').style.display = 'none'; document.getElementById('2502.05264v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">21 pages, 7 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.02141">arXiv:2502.02141</a> <span> [<a href="https://arxiv.org/pdf/2502.02141">pdf</a>, <a href="https://arxiv.org/ps/2502.02141">ps</a>, <a href="https://arxiv.org/format/2502.02141">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> </div> </div> <p class="title is-5 mathjax"> NFV-Enabled Service Recovery in Space-Air-Ground Integrated Networks: A Matching Game Based Approach </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jia%2C+Z">Ziye Jia</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+Y">Yilu Cao</a>, <a href="/search/cs?searchtype=author&query=He%2C+L">Lijun He</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Guangxia Li</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+F">Fuhui Zhou</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Q">Qihui Wu</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zhu Han</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.02141v1-abstract-short" style="display: inline;"> To achieve ubiquitous connectivity of the sixth generation communication, the space-air-ground integrated network (SAGIN) is a popular topic. However, the dynamic nodes in SAGIN such as satellites and unmanned aerial vehicles, may be fragile and out of operation, which can potentially cause service failure. Therefore, the research on service recovery in SAGIN under situations of resource failure i… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.02141v1-abstract-full').style.display = 'inline'; document.getElementById('2502.02141v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.02141v1-abstract-full" style="display: none;"> To achieve ubiquitous connectivity of the sixth generation communication, the space-air-ground integrated network (SAGIN) is a popular topic. However, the dynamic nodes in SAGIN such as satellites and unmanned aerial vehicles, may be fragile and out of operation, which can potentially cause service failure. Therefore, the research on service recovery in SAGIN under situations of resource failure is critical. In order to facilitate the flexible resource utilization of SAGIN, the network function virtualization technology (NFV) is proposed to be employed. Firstly, the task management is transformed into the deployment of service function chains (SFCs). Then, we design an NFV-based SFC recovery model in SAGIN in the face of resource failure, so that tasks can quickly select alternative resources to complete deployments. Moreover, the problem of SFC recovery is formulated to minimize the total time consumption for all completed SFCs. Since it is an NP-hard integer linear programming problem, we propose the efficient recovery algorithm based on the matching game. Finally, via various simulations, the effectiveness of the proposed algorithm and its advantages are verified, where the total time consumption is optimized by about 25%, compared with other benchmark methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.02141v1-abstract-full').style.display = 'none'; document.getElementById('2502.02141v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.00463">arXiv:2502.00463</a> <span> [<a href="https://arxiv.org/pdf/2502.00463">pdf</a>, <a href="https://arxiv.org/format/2502.00463">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Optimization and Control">math.OC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Efficient Over-parameterized Matrix Sensing from Noisy Measurements via Alternating Preconditioned Gradient Descent </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhiyu Liu</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zhi Han</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Y">Yandong Tang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Hai Zhang</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+S">Shaojie Tang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yao Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.00463v2-abstract-short" style="display: inline;"> We consider the noisy matrix sensing problem in the over-parameterization setting, where the estimated rank $r$ is larger than the true rank $r_\star$. Specifically, our main objective is to recover a matrix $ X_\star \in \mathbb{R}^{n_1 \times n_2} $ with rank $ r_\star $ from noisy measurements using an over-parameterized factorized form $ LR^\top $, where… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00463v2-abstract-full').style.display = 'inline'; document.getElementById('2502.00463v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.00463v2-abstract-full" style="display: none;"> We consider the noisy matrix sensing problem in the over-parameterization setting, where the estimated rank $r$ is larger than the true rank $r_\star$. Specifically, our main objective is to recover a matrix $ X_\star \in \mathbb{R}^{n_1 \times n_2} $ with rank $ r_\star $ from noisy measurements using an over-parameterized factorized form $ LR^\top $, where $ L \in \mathbb{R}^{n_1 \times r}, \, R \in \mathbb{R}^{n_2 \times r} $ and $ \min\{n_1, n_2\} \ge r > r_\star $, with the true rank $ r_\star $ being unknown. Recently, preconditioning methods have been proposed to accelerate the convergence of matrix sensing problem compared to vanilla gradient descent, incorporating preconditioning terms $ (L^\top L + 位I)^{-1} $ and $ (R^\top R + 位I)^{-1} $ into the original gradient. However, these methods require careful tuning of the damping parameter $位$ and are sensitive to initial points and step size. To address these limitations, we propose the alternating preconditioned gradient descent (APGD) algorithm, which alternately updates the two factor matrices, eliminating the need for the damping parameter and enabling faster convergence with larger step sizes. We theoretically prove that APGD achieves near-optimal error convergence at a linear rate, starting from arbitrary random initializations. Through extensive experiments, we validate our theoretical results and demonstrate that APGD outperforms other methods, achieving the fastest convergence rate. Notably, both our theoretical analysis and experimental results illustrate that APGD does not rely on the initialization procedure, making it more practical and versatile. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00463v2-abstract-full').style.display = 'none'; document.getElementById('2502.00463v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">18 pages, 8 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.00354">arXiv:2502.00354</a> <span> [<a href="https://arxiv.org/pdf/2502.00354">pdf</a>, <a href="https://arxiv.org/format/2502.00354">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3696410.3714561">10.1145/3696410.3714561 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> PM-MOE: Mixture of Experts on Private Model Parameters for Personalized Federated Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Feng%2C+Y">Yu Feng</a>, <a href="/search/cs?searchtype=author&query=Geng%2C+Y">Yangli-ao Geng</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yifan Zhu</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zongfu Han</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+X">Xie Yu</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+K">Kaiwen Xue</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+H">Haoran Luo</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+M">Mengyang Sun</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+G">Guangwei Zhang</a>, <a href="/search/cs?searchtype=author&query=Song%2C+M">Meina Song</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.00354v1-abstract-short" style="display: inline;"> Federated learning (FL) has gained widespread attention for its privacy-preserving and collaborative learning capabilities. Due to significant statistical heterogeneity, traditional FL struggles to generalize a shared model across diverse data domains. Personalized federated learning addresses this issue by dividing the model into a globally shared part and a locally private part, with the local m… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00354v1-abstract-full').style.display = 'inline'; document.getElementById('2502.00354v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.00354v1-abstract-full" style="display: none;"> Federated learning (FL) has gained widespread attention for its privacy-preserving and collaborative learning capabilities. Due to significant statistical heterogeneity, traditional FL struggles to generalize a shared model across diverse data domains. Personalized federated learning addresses this issue by dividing the model into a globally shared part and a locally private part, with the local model correcting representation biases introduced by the global model. Nevertheless, locally converged parameters more accurately capture domain-specific knowledge, and current methods overlook the potential benefits of these parameters. To address these limitations, we propose PM-MoE architecture. This architecture integrates a mixture of personalized modules and an energy-based personalized modules denoising, enabling each client to select beneficial personalized parameters from other clients. We applied the PM-MoE architecture to nine recent model-split-based personalized federated learning algorithms, achieving performance improvements with minimal additional training. Extensive experiments on six widely adopted datasets and two heterogeneity settings validate the effectiveness of our approach. The source code is available at \url{https://github.com/dannis97500/PM-MOE}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00354v1-abstract-full').style.display = 'none'; document.getElementById('2502.00354v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.14249">arXiv:2501.14249</a> <span> [<a href="https://arxiv.org/pdf/2501.14249">pdf</a>, <a href="https://arxiv.org/format/2501.14249">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Humanity's Last Exam </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Phan%2C+L">Long Phan</a>, <a href="/search/cs?searchtype=author&query=Gatti%2C+A">Alice Gatti</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Ziwen Han</a>, <a href="/search/cs?searchtype=author&query=Li%2C+N">Nathaniel Li</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+J">Josephina Hu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Hugh Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C+B+C">Chen Bo Calvin Zhang</a>, <a href="/search/cs?searchtype=author&query=Shaaban%2C+M">Mohamed Shaaban</a>, <a href="/search/cs?searchtype=author&query=Ling%2C+J">John Ling</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+S">Sean Shi</a>, <a href="/search/cs?searchtype=author&query=Choi%2C+M">Michael Choi</a>, <a href="/search/cs?searchtype=author&query=Agrawal%2C+A">Anish Agrawal</a>, <a href="/search/cs?searchtype=author&query=Chopra%2C+A">Arnav Chopra</a>, <a href="/search/cs?searchtype=author&query=Khoja%2C+A">Adam Khoja</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+R">Ryan Kim</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+R">Richard Ren</a>, <a href="/search/cs?searchtype=author&query=Hausenloy%2C+J">Jason Hausenloy</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+O">Oliver Zhang</a>, <a href="/search/cs?searchtype=author&query=Mazeika%2C+M">Mantas Mazeika</a>, <a href="/search/cs?searchtype=author&query=Nguyen%2C+T">Tung Nguyen</a>, <a href="/search/cs?searchtype=author&query=Anderson%2C+D">Daron Anderson</a>, <a href="/search/cs?searchtype=author&query=Shah%2C+I+A">Imad Ali Shah</a>, <a href="/search/cs?searchtype=author&query=Doroshenko%2C+M">Mikhail Doroshenko</a>, <a href="/search/cs?searchtype=author&query=Stokes%2C+A+C">Alun Cennyth Stokes</a>, <a href="/search/cs?searchtype=author&query=Mahmood%2C+M">Mobeen Mahmood</a> , et al. (709 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.14249v5-abstract-short" style="display: inline;"> Benchmarks are important tools for tracking the rapid advancements in large language model (LLM) capabilities. However, benchmarks are not keeping pace in difficulty: LLMs now achieve over 90\% accuracy on popular benchmarks like MMLU, limiting informed measurement of state-of-the-art LLM capabilities. In response, we introduce Humanity's Last Exam (HLE), a multi-modal benchmark at the frontier of… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.14249v5-abstract-full').style.display = 'inline'; document.getElementById('2501.14249v5-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.14249v5-abstract-full" style="display: none;"> Benchmarks are important tools for tracking the rapid advancements in large language model (LLM) capabilities. However, benchmarks are not keeping pace in difficulty: LLMs now achieve over 90\% accuracy on popular benchmarks like MMLU, limiting informed measurement of state-of-the-art LLM capabilities. In response, we introduce Humanity's Last Exam (HLE), a multi-modal benchmark at the frontier of human knowledge, designed to be the final closed-ended academic benchmark of its kind with broad subject coverage. HLE consists of 2,700 questions across dozens of subjects, including mathematics, humanities, and the natural sciences. HLE is developed globally by subject-matter experts and consists of multiple-choice and short-answer questions suitable for automated grading. Each question has a known solution that is unambiguous and easily verifiable, but cannot be quickly answered via internet retrieval. State-of-the-art LLMs demonstrate low accuracy and calibration on HLE, highlighting a significant gap between current LLM capabilities and the expert human frontier on closed-ended academic questions. To inform research and policymaking upon a clear understanding of model capabilities, we publicly release HLE at https://lastexam.ai. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.14249v5-abstract-full').style.display = 'none'; document.getElementById('2501.14249v5-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">27 pages, 6 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.11074">arXiv:2501.11074</a> <span> [<a href="https://arxiv.org/pdf/2501.11074">pdf</a>, <a href="https://arxiv.org/format/2501.11074">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Achieving Network Resilience through Graph Neural Network-enabled Deep Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+X">Xuzeng Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+T">Tao Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jian Wang</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zhen Han</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jiqiang Liu</a>, <a href="/search/cs?searchtype=author&query=Kang%2C+J">Jiawen Kang</a>, <a href="/search/cs?searchtype=author&query=Niyato%2C+D">Dusit Niyato</a>, <a href="/search/cs?searchtype=author&query=Jamalipour%2C+A">Abbas Jamalipour</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.11074v1-abstract-short" style="display: inline;"> Deep reinforcement learning (DRL) has been widely used in many important tasks of communication networks. In order to improve the perception ability of DRL on the network, some studies have combined graph neural networks (GNNs) with DRL, which use the GNNs to extract unstructured features of the network. However, as networks continue to evolve and become increasingly complex, existing GNN-DRL meth… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.11074v1-abstract-full').style.display = 'inline'; document.getElementById('2501.11074v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.11074v1-abstract-full" style="display: none;"> Deep reinforcement learning (DRL) has been widely used in many important tasks of communication networks. In order to improve the perception ability of DRL on the network, some studies have combined graph neural networks (GNNs) with DRL, which use the GNNs to extract unstructured features of the network. However, as networks continue to evolve and become increasingly complex, existing GNN-DRL methods still face challenges in terms of scalability and robustness. Moreover, these methods are inadequate for addressing network security issues. From the perspective of security and robustness, this paper explores the solution of combining GNNs with DRL to build a resilient network. This article starts with a brief tutorial of GNNs and DRL, and introduces their existing applications in networks. Furthermore, we introduce the network security methods that can be strengthened by GNN-DRL approaches. Then, we designed a framework based on GNN-DRL to defend against attacks and enhance network resilience. Additionally, we conduct a case study using an encrypted traffic dataset collected from real IoT environments, and the results demonstrated the effectiveness and superiority of our framework. Finally, we highlight key open challenges and opportunities for enhancing network resilience with GNN-DRL. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.11074v1-abstract-full').style.display = 'none'; document.getElementById('2501.11074v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.09428">arXiv:2501.09428</a> <span> [<a href="https://arxiv.org/pdf/2501.09428">pdf</a>, <a href="https://arxiv.org/format/2501.09428">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> AugRefer: Advancing 3D Visual Grounding via Cross-Modal Augmentation and Spatial Relation-based Referring </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xinyi Wang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+N">Na Zhao</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zhiyuan Han</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+D">Dan Guo</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xun Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.09428v1-abstract-short" style="display: inline;"> 3D visual grounding (3DVG), which aims to correlate a natural language description with the target object within a 3D scene, is a significant yet challenging task. Despite recent advancements in this domain, existing approaches commonly encounter a shortage: a limited amount and diversity of text3D pairs available for training. Moreover, they fall short in effectively leveraging different contextu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.09428v1-abstract-full').style.display = 'inline'; document.getElementById('2501.09428v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.09428v1-abstract-full" style="display: none;"> 3D visual grounding (3DVG), which aims to correlate a natural language description with the target object within a 3D scene, is a significant yet challenging task. Despite recent advancements in this domain, existing approaches commonly encounter a shortage: a limited amount and diversity of text3D pairs available for training. Moreover, they fall short in effectively leveraging different contextual clues (e.g., rich spatial relations within the 3D visual space) for grounding. To address these limitations, we propose AugRefer, a novel approach for advancing 3D visual grounding. AugRefer introduces cross-modal augmentation designed to extensively generate diverse text-3D pairs by placing objects into 3D scenes and creating accurate and semantically rich descriptions using foundation models. Notably, the resulting pairs can be utilized by any existing 3DVG methods for enriching their training data. Additionally, AugRefer presents a language-spatial adaptive decoder that effectively adapts the potential referring objects based on the language description and various 3D spatial relations. Extensive experiments on three benchmark datasets clearly validate the effectiveness of AugRefer. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.09428v1-abstract-full').style.display = 'none'; document.getElementById('2501.09428v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">AAAI 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.06880">arXiv:2501.06880</a> <span> [<a href="https://arxiv.org/pdf/2501.06880">pdf</a>, <a href="https://arxiv.org/format/2501.06880">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Real-Time Neural-Enhancement for Online Cloud Gaming </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jiang%2C+S">Shan Jiang</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zhenhua Han</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+H">Haisheng Tan</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+X">Xinyang Jiang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yifan Yang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiaoxi Zhang</a>, <a href="/search/cs?searchtype=author&query=Ni%2C+H">Hongqiu Ni</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yuqing Yang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiang-Yang Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.06880v1-abstract-short" style="display: inline;"> Online Cloud gaming demands real-time, high-quality video transmission across variable wide-area networks (WANs). Neural-enhanced video transmission algorithms employing super-resolution (SR) for video quality enhancement have effectively challenged WAN environments. However, these SR-based methods require intensive fine-tuning for the whole video, making it infeasible in diverse online cloud gami… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.06880v1-abstract-full').style.display = 'inline'; document.getElementById('2501.06880v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.06880v1-abstract-full" style="display: none;"> Online Cloud gaming demands real-time, high-quality video transmission across variable wide-area networks (WANs). Neural-enhanced video transmission algorithms employing super-resolution (SR) for video quality enhancement have effectively challenged WAN environments. However, these SR-based methods require intensive fine-tuning for the whole video, making it infeasible in diverse online cloud gaming. To address this, we introduce River, a cloud gaming delivery framework designed based on the observation that video segment features in cloud gaming are typically repetitive and redundant. This permits a significant opportunity to reuse fine-tuned SR models, reducing the fine-tuning latency of minutes to query latency of milliseconds. To enable the idea, we design a practical system that addresses several challenges, such as model organization, online model scheduler, and transfer strategy. River first builds a content-aware encoder that fine-tunes SR models for diverse video segments and stores them in a lookup table. When delivering cloud gaming video streams online, River checks the video features and retrieves the most relevant SR models to enhance the frame quality. Meanwhile, if no existing SR model performs well enough for some video segments, River will further fine-tune new models and update the lookup table. Finally, to avoid the overhead of streaming model weight to the clients, River designs a prefetching strategy that predicts the models with the highest possibility of being retrieved. Our evaluation based on real video game streaming demonstrates River can reduce redundant training overhead by 44% and improve the Peak-Signal-to-Noise-Ratio by 1.81dB compared to the SOTA solutions. Practical deployment shows River meets real-time requirements, achieving approximately 720p 20fps on mobile devices. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.06880v1-abstract-full').style.display = 'none'; document.getElementById('2501.06880v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.06526">arXiv:2501.06526</a> <span> [<a href="https://arxiv.org/pdf/2501.06526">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Emerging Technologies">cs.ET</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Advancements in UAV-based Integrated Sensing and Communication: A Comprehensive Survey </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ahmed%2C+M">Manzoor Ahmed</a>, <a href="/search/cs?searchtype=author&query=Nasir%2C+A+A">Ali Arshad Nasir</a>, <a href="/search/cs?searchtype=author&query=Masood%2C+M">Mudassir Masood</a>, <a href="/search/cs?searchtype=author&query=Memon%2C+K+A">Kamran Ali Memon</a>, <a href="/search/cs?searchtype=author&query=Qureshi%2C+K+K">Khurram Karim Qureshi</a>, <a href="/search/cs?searchtype=author&query=Khan%2C+F">Feroz Khan</a>, <a href="/search/cs?searchtype=author&query=Khan%2C+W+U">Wali Ullah Khan</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+F">Fang Xu</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zhu Han</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.06526v1-abstract-short" style="display: inline;"> Unmanned aerial vehicle (UAV)-based integrated sensing and communication (ISAC) systems are poised to revolutionize next-generation wireless networks by enabling simultaneous sensing and communication (S\&C). This survey comprehensively reviews UAV-ISAC systems, highlighting foundational concepts, key advancements, and future research directions. We explore recent advancements in UAV-based ISAC sy… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.06526v1-abstract-full').style.display = 'inline'; document.getElementById('2501.06526v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.06526v1-abstract-full" style="display: none;"> Unmanned aerial vehicle (UAV)-based integrated sensing and communication (ISAC) systems are poised to revolutionize next-generation wireless networks by enabling simultaneous sensing and communication (S\&C). This survey comprehensively reviews UAV-ISAC systems, highlighting foundational concepts, key advancements, and future research directions. We explore recent advancements in UAV-based ISAC systems from various perspectives and objectives, including advanced channel estimation (CE), beam tracking, and system throughput optimization under joint sensing and communication S\&C constraints. Additionally, we examine weighted sum rate (WSR) and sensing trade-offs, delay and age of information (AoI) minimization, energy efficiency (EE), and security enhancement. These applications highlight the potential of UAV-based ISAC systems to improve spectrum utilization, enhance communication reliability, reduce latency, and optimize energy consumption across diverse domains, including smart cities, disaster relief, and defense operations. The survey also features summary tables for comparative analysis of existing methodologies, emphasizing performance, limitations, and effectiveness in addressing various challenges. By synthesizing recent advancements and identifying open research challenges, this survey aims to be a valuable resource for developing efficient, adaptive, and secure UAV-based ISAC systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.06526v1-abstract-full').style.display = 'none'; document.getElementById('2501.06526v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">25, 6</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.02952">arXiv:2501.02952</a> <span> [<a href="https://arxiv.org/pdf/2501.02952">pdf</a>, <a href="https://arxiv.org/format/2501.02952">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Online Collaborative Resource Allocation and Task Offloading for Multi-access Edge Computing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sun%2C+G">Geng Sun</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+M">Minghua Yuan</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Z">Zemin Sun</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jiacheng Wang</a>, <a href="/search/cs?searchtype=author&query=Du%2C+H">Hongyang Du</a>, <a href="/search/cs?searchtype=author&query=Niyato%2C+D">Dusit Niyato</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zhu Han</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+D+I">Dong In Kim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.02952v1-abstract-short" style="display: inline;"> Multi-access edge computing (MEC) is emerging as a promising paradigm to provide flexible computing services close to user devices (UDs). However, meeting the computation-hungry and delay-sensitive demands of UDs faces several challenges, including the resource constraints of MEC servers, inherent dynamic and complex features in the MEC system, and difficulty in dealing with the time-coupled and d… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.02952v1-abstract-full').style.display = 'inline'; document.getElementById('2501.02952v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.02952v1-abstract-full" style="display: none;"> Multi-access edge computing (MEC) is emerging as a promising paradigm to provide flexible computing services close to user devices (UDs). However, meeting the computation-hungry and delay-sensitive demands of UDs faces several challenges, including the resource constraints of MEC servers, inherent dynamic and complex features in the MEC system, and difficulty in dealing with the time-coupled and decision-coupled optimization. In this work, we first present an edge-cloud collaborative MEC architecture, where the MEC servers and cloud collaboratively provide offloading services for UDs. Moreover, we formulate an energy-efficient and delay-aware optimization problem (EEDAOP) to minimize the energy consumption of UDs under the constraints of task deadlines and long-term queuing delays. Since the problem is proved to be non-convex mixed integer nonlinear programming (MINLP), we propose an online joint communication resource allocation and task offloading approach (OJCTA). Specifically, we transform EEDAOP into a real-time optimization problem by employing the Lyapunov optimization framework. Then, to solve the real-time optimization problem, we propose a communication resource allocation and task offloading optimization method by employing the Tammer decomposition mechanism, convex optimization method, bilateral matching mechanism, and dependent rounding method. Simulation results demonstrate that the proposed OJCTA can achieve superior system performance compared to the benchmark approaches. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.02952v1-abstract-full').style.display = 'none'; document.getElementById('2501.02952v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.02487">arXiv:2501.02487</a> <span> [<a href="https://arxiv.org/pdf/2501.02487">pdf</a>, <a href="https://arxiv.org/format/2501.02487">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ACE++: Instruction-Based Image Creation and Editing via Context-Aware Content Filling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mao%2C+C">Chaojie Mao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jingfeng Zhang</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+Y">Yulin Pan</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+Z">Zeyinzi Jiang</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zhen Han</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yu Liu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Jingren Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.02487v3-abstract-short" style="display: inline;"> We report ACE++, an instruction-based diffusion framework that tackles various image generation and editing tasks. Inspired by the input format for the inpainting task proposed by FLUX.1-Fill-dev, we improve the Long-context Condition Unit (LCU) introduced in ACE and extend this input paradigm to any editing and generation tasks. To take full advantage of image generative priors, we develop a two-… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.02487v3-abstract-full').style.display = 'inline'; document.getElementById('2501.02487v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.02487v3-abstract-full" style="display: none;"> We report ACE++, an instruction-based diffusion framework that tackles various image generation and editing tasks. Inspired by the input format for the inpainting task proposed by FLUX.1-Fill-dev, we improve the Long-context Condition Unit (LCU) introduced in ACE and extend this input paradigm to any editing and generation tasks. To take full advantage of image generative priors, we develop a two-stage training scheme to minimize the efforts of finetuning powerful text-to-image diffusion models like FLUX.1-dev. In the first stage, we pre-train the model using task data with the 0-ref tasks from the text-to-image model. There are many models in the community based on the post-training of text-to-image foundational models that meet this training paradigm of the first stage. For example, FLUX.1-Fill-dev deals primarily with painting tasks and can be used as an initialization to accelerate the training process. In the second stage, we finetune the above model to support the general instructions using all tasks defined in ACE. To promote the widespread application of ACE++ in different scenarios, we provide a comprehensive set of models that cover both full finetuning and lightweight finetuning, while considering general applicability and applicability in vertical scenarios. The qualitative analysis showcases the superiority of ACE++ in terms of generating image quality and prompt following ability. Code and models will be available on the project page: https://ali-vilab. github.io/ACE_plus_page/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.02487v3-abstract-full').style.display = 'none'; document.getElementById('2501.02487v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 5 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.01397">arXiv:2501.01397</a> <span> [<a href="https://arxiv.org/pdf/2501.01397">pdf</a>, <a href="https://arxiv.org/format/2501.01397">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> WeAudit: Scaffolding User Auditors and AI Practitioners in Auditing Generative AI </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Deng%2C+W+H">Wesley Hanwen Deng</a>, <a href="/search/cs?searchtype=author&query=Claire%2C+W">Wang Claire</a>, <a href="/search/cs?searchtype=author&query=Han%2C+H+Z">Howard Ziyu Han</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+J+I">Jason I. Hong</a>, <a href="/search/cs?searchtype=author&query=Holstein%2C+K">Kenneth Holstein</a>, <a href="/search/cs?searchtype=author&query=Eslami%2C+M">Motahhare Eslami</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.01397v3-abstract-short" style="display: inline;"> There has been growing interest from both practitioners and researchers in engaging end users in AI auditing, to draw upon users' unique knowledge and lived experiences. However, we know little about how to effectively scaffold end users in auditing in ways that can generate actionable insights for AI practitioners. Through formative studies with both users and AI practitioners, we first identifie… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.01397v3-abstract-full').style.display = 'inline'; document.getElementById('2501.01397v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.01397v3-abstract-full" style="display: none;"> There has been growing interest from both practitioners and researchers in engaging end users in AI auditing, to draw upon users' unique knowledge and lived experiences. However, we know little about how to effectively scaffold end users in auditing in ways that can generate actionable insights for AI practitioners. Through formative studies with both users and AI practitioners, we first identified a set of design goals to support user-engaged AI auditing. We then developed WeAudit, a workflow and system that supports end users in auditing AI both individually and collectively. We evaluated WeAudit through a three-week user study with user auditors and interviews with industry Generative AI practitioners. Our findings offer insights into how WeAudit supports users in noticing and reflecting upon potential AI harms and in articulating their findings in ways that industry practitioners can act upon. Based on our observations and feedback from both users and practitioners, we identify several opportunities to better support user engagement in AI auditing processes. We discuss implications for future research to support effective and responsible user engagement in AI auditing and red-teaming. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.01397v3-abstract-full').style.display = 'none'; document.getElementById('2501.01397v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.00320">arXiv:2501.00320</a> <span> [<a href="https://arxiv.org/pdf/2501.00320">pdf</a>, <a href="https://arxiv.org/format/2501.00320">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Autonomous Alignment with Human Value on Altruism through Considerate Self-imagination and Theory of Mind </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tong%2C+H">Haibo Tong</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+E">Enmeng Lu</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Y">Yinqian Sun</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zhengqiang Han</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+C">Chao Liu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+F">Feifei Zhao</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+Y">Yi Zeng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.00320v2-abstract-short" style="display: inline;"> With the widespread application of Artificial Intelligence (AI) in human society, enabling AI to autonomously align with human values has become a pressing issue to ensure its sustainable development and benefit to humanity. One of the most important aspects of aligning with human values is the necessity for agents to autonomously make altruistic, safe, and ethical decisions, considering and carin… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.00320v2-abstract-full').style.display = 'inline'; document.getElementById('2501.00320v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.00320v2-abstract-full" style="display: none;"> With the widespread application of Artificial Intelligence (AI) in human society, enabling AI to autonomously align with human values has become a pressing issue to ensure its sustainable development and benefit to humanity. One of the most important aspects of aligning with human values is the necessity for agents to autonomously make altruistic, safe, and ethical decisions, considering and caring for human well-being. Current AI extremely pursues absolute superiority in certain tasks, remaining indifferent to the surrounding environment and other agents, which has led to numerous safety risks. Altruistic behavior in human society originates from humans' capacity for empathizing others, known as Theory of Mind (ToM), combined with predictive imaginative interactions before taking action to produce thoughtful and altruistic behaviors. Inspired by this, we are committed to endow agents with considerate self-imagination and ToM capabilities, driving them through implicit intrinsic motivations to autonomously align with human altruistic values. By integrating ToM within the imaginative space, agents keep an eye on the well-being of other agents in real time, proactively anticipate potential risks to themselves and others, and make thoughtful altruistic decisions that balance negative effects on the environment. The ancient Chinese story of Sima Guang Smashes the Vat illustrates the moral behavior of the young Sima Guang smashed a vat to save a child who had accidentally fallen into it, which is an excellent reference scenario for this paper. We design an experimental scenario similar to Sima Guang Smashes the Vat and its variants with different complexities, which reflects the trade-offs and comprehensive considerations between self-goals, altruistic rescue, and avoiding negative side effects. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.00320v2-abstract-full').style.display = 'none'; document.getElementById('2501.00320v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 31 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.19720">arXiv:2412.19720</a> <span> [<a href="https://arxiv.org/pdf/2412.19720">pdf</a>, <a href="https://arxiv.org/format/2412.19720">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Sharpening Neural Implicit Functions with Frequency Consolidation Priors </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+C">Chao Chen</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yu-Shen Liu</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zhizhong Han</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.19720v1-abstract-short" style="display: inline;"> Signed Distance Functions (SDFs) are vital implicit representations to represent high fidelity 3D surfaces. Current methods mainly leverage a neural network to learn an SDF from various supervisions including signed distances, 3D point clouds, or multi-view images. However, due to various reasons including the bias of neural network on low frequency content, 3D unaware sampling, sparsity in point… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.19720v1-abstract-full').style.display = 'inline'; document.getElementById('2412.19720v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.19720v1-abstract-full" style="display: none;"> Signed Distance Functions (SDFs) are vital implicit representations to represent high fidelity 3D surfaces. Current methods mainly leverage a neural network to learn an SDF from various supervisions including signed distances, 3D point clouds, or multi-view images. However, due to various reasons including the bias of neural network on low frequency content, 3D unaware sampling, sparsity in point clouds, or low resolutions of images, neural implicit representations still struggle to represent geometries with high frequency components like sharp structures, especially for the ones learned from images or point clouds. To overcome this challenge, we introduce a method to sharpen a low frequency SDF observation by recovering its high frequency components, pursuing a sharper and more complete surface. Our key idea is to learn a mapping from a low frequency observation to a full frequency coverage in a data-driven manner, leading to a prior knowledge of shape consolidation in the frequency domain, dubbed frequency consolidation priors. To better generalize a learned prior to unseen shapes, we introduce to represent frequency components as embeddings and disentangle the embedding of the low frequency component from the embedding of the full frequency component. This disentanglement allows the prior to generalize on an unseen low frequency observation by simply recovering its full frequency embedding through a test-time self-reconstruction. Our evaluations under widely used benchmarks or real scenes show that our method can recover high frequency component and produce more accurate surfaces than the latest methods. The code, data, and pre-trained models are available at \url{https://github.com/chenchao15/FCP}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.19720v1-abstract-full').style.display = 'none'; document.getElementById('2412.19720v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by AAAI 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.19496">arXiv:2412.19496</a> <span> [<a href="https://arxiv.org/pdf/2412.19496">pdf</a>, <a href="https://arxiv.org/format/2412.19496">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Multi-P$^2$A: A Multi-perspective Benchmark on Privacy Assessment for Large Vision-Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jie Zhang</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+X">Xiangkui Cao</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zhouyu Han</a>, <a href="/search/cs?searchtype=author&query=Shan%2C+S">Shiguang Shan</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xilin Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.19496v1-abstract-short" style="display: inline;"> Large Vision-Language Models (LVLMs) exhibit impressive potential across various tasks but also face significant privacy risks, limiting their practical applications. Current researches on privacy assessment for LVLMs is limited in scope, with gaps in both assessment dimensions and privacy categories. To bridge this gap, we propose Multi-P$^2$A, a comprehensive benchmark for evaluating the privacy… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.19496v1-abstract-full').style.display = 'inline'; document.getElementById('2412.19496v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.19496v1-abstract-full" style="display: none;"> Large Vision-Language Models (LVLMs) exhibit impressive potential across various tasks but also face significant privacy risks, limiting their practical applications. Current researches on privacy assessment for LVLMs is limited in scope, with gaps in both assessment dimensions and privacy categories. To bridge this gap, we propose Multi-P$^2$A, a comprehensive benchmark for evaluating the privacy preservation capabilities of LVLMs in terms of privacy awareness and leakage. Privacy awareness measures the model's ability to recognize the privacy sensitivity of input data, while privacy leakage assesses the risk of the model unintentionally disclosing privacy information in its output. We design a range of sub-tasks to thoroughly evaluate the model's privacy protection offered by LVLMs. Multi-P$^2$A covers 26 categories of personal privacy, 15 categories of trade secrets, and 18 categories of state secrets, totaling 31,962 samples. Based on Multi-P$^2$A, we evaluate the privacy preservation capabilities of 21 open-source and 2 closed-source LVLMs. Our results reveal that current LVLMs generally pose a high risk of facilitating privacy breaches, with vulnerabilities varying across personal privacy, trade secret, and state secret. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.19496v1-abstract-full').style.display = 'none'; document.getElementById('2412.19496v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.19354">arXiv:2412.19354</a> <span> [<a href="https://arxiv.org/pdf/2412.19354">pdf</a>, <a href="https://arxiv.org/format/2412.19354">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Federated Hybrid Training and Self-Adversarial Distillation: Towards Robust Edge Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Qiao%2C+Y">Yu Qiao</a>, <a href="/search/cs?searchtype=author&query=Adhikary%2C+A">Apurba Adhikary</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+K">Kitae Kim</a>, <a href="/search/cs?searchtype=author&query=Huh%2C+E">Eui-Nam Huh</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zhu Han</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+C+S">Choong Seon Hong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.19354v1-abstract-short" style="display: inline;"> Federated learning (FL) is a distributed training technology that enhances data privacy in mobile edge networks by allowing data owners to collaborate without transmitting raw data to the edge server. However, data heterogeneity and adversarial attacks pose challenges to develop an unbiased and robust global model for edge deployment. To address this, we propose Federated hyBrid Adversarial traini… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.19354v1-abstract-full').style.display = 'inline'; document.getElementById('2412.19354v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.19354v1-abstract-full" style="display: none;"> Federated learning (FL) is a distributed training technology that enhances data privacy in mobile edge networks by allowing data owners to collaborate without transmitting raw data to the edge server. However, data heterogeneity and adversarial attacks pose challenges to develop an unbiased and robust global model for edge deployment. To address this, we propose Federated hyBrid Adversarial training and self-adversarial disTillation (FedBAT), a new framework designed to improve both robustness and generalization of the global model. FedBAT seamlessly integrates hybrid adversarial training and self-adversarial distillation into the conventional FL framework from data augmentation and feature distillation perspectives. From a data augmentation perspective, we propose hybrid adversarial training to defend against adversarial attacks by balancing accuracy and robustness through a weighted combination of standard and adversarial training. From a feature distillation perspective, we introduce a novel augmentation-invariant adversarial distillation method that aligns local adversarial features of augmented images with their corresponding unbiased global clean features. This alignment can effectively mitigate bias from data heterogeneity while enhancing both the robustness and generalization of the global model. Extensive experimental results across multiple datasets demonstrate that FedBAT yields comparable or superior performance gains in improving robustness while maintaining accuracy compared to several baselines. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.19354v1-abstract-full').style.display = 'none'; document.getElementById('2412.19354v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.16476">arXiv:2412.16476</a> <span> [<a href="https://arxiv.org/pdf/2412.16476">pdf</a>, <a href="https://arxiv.org/format/2412.16476">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Query Quantized Neural SLAM </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jiang%2C+S">Sijia Jiang</a>, <a href="/search/cs?searchtype=author&query=Hua%2C+J">Jing Hua</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zhizhong Han</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.16476v1-abstract-short" style="display: inline;"> Neural implicit representations have shown remarkable abilities in jointly modeling geometry, color, and camera poses in simultaneous localization and mapping (SLAM). Current methods use coordinates, positional encodings, or other geometry features as input to query neural implicit functions for signed distances and color which produce rendering errors to drive the optimization in overfitting imag… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.16476v1-abstract-full').style.display = 'inline'; document.getElementById('2412.16476v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.16476v1-abstract-full" style="display: none;"> Neural implicit representations have shown remarkable abilities in jointly modeling geometry, color, and camera poses in simultaneous localization and mapping (SLAM). Current methods use coordinates, positional encodings, or other geometry features as input to query neural implicit functions for signed distances and color which produce rendering errors to drive the optimization in overfitting image observations. However, due to the run time efficiency requirement in SLAM systems, we are merely allowed to conduct optimization on each frame in few iterations, which is far from enough for neural networks to overfit these queries. The underfitting usually results in severe drifts in camera tracking and artifacts in reconstruction. To resolve this issue, we propose query quantized neural SLAM which uses quantized queries to reduce variations of input for much easier and faster overfitting a frame. To this end, we quantize a query into a discrete representation with a set of codes, and only allow neural networks to observe a finite number of variations. This allows neural networks to become increasingly familiar with these codes after overfitting more and more previous frames. Moreover, we also introduce novel initialization, losses, and argumentation to stabilize the optimization with significant uncertainty in the early optimization stage, constrain the optimization space, and estimate camera poses more accurately. We justify the effectiveness of each design and report visual and numerical comparisons on widely used benchmarks to show our superiority over the latest methods in both reconstruction and camera tracking. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.16476v1-abstract-full').style.display = 'none'; document.getElementById('2412.16476v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To be appeared at AAAI25</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.16467">arXiv:2412.16467</a> <span> [<a href="https://arxiv.org/pdf/2412.16467">pdf</a>, <a href="https://arxiv.org/format/2412.16467">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Sensing Surface Patches in Volume Rendering for Inferring Signed Distance Functions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jiang%2C+S">Sijia Jiang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+T">Tong Wu</a>, <a href="/search/cs?searchtype=author&query=Hua%2C+J">Jing Hua</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zhizhong Han</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.16467v1-abstract-short" style="display: inline;"> It is vital to recover 3D geometry from multi-view RGB images in many 3D computer vision tasks. The latest methods infer the geometry represented as a signed distance field by minimizing the rendering error on the field through volume rendering. However, it is still challenging to explicitly impose constraints on surfaces for inferring more geometry details due to the limited ability of sensing su… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.16467v1-abstract-full').style.display = 'inline'; document.getElementById('2412.16467v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.16467v1-abstract-full" style="display: none;"> It is vital to recover 3D geometry from multi-view RGB images in many 3D computer vision tasks. The latest methods infer the geometry represented as a signed distance field by minimizing the rendering error on the field through volume rendering. However, it is still challenging to explicitly impose constraints on surfaces for inferring more geometry details due to the limited ability of sensing surfaces in volume rendering. To resolve this problem, we introduce a method to infer signed distance functions (SDFs) with a better sense of surfaces through volume rendering. Using the gradients and signed distances, we establish a small surface patch centered at the estimated intersection along a ray by pulling points randomly sampled nearby. Hence, we are able to explicitly impose surface constraints on the sensed surface patch, such as multi-view photo consistency and supervision from depth or normal priors, through volume rendering. We evaluate our method by numerical and visual comparisons on scene benchmarks. Our superiority over the latest methods justifies our effectiveness. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.16467v1-abstract-full').style.display = 'none'; document.getElementById('2412.16467v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To be appeared at AAAI25</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.16311">arXiv:2412.16311</a> <span> [<a href="https://arxiv.org/pdf/2412.16311">pdf</a>, <a href="https://arxiv.org/format/2412.16311">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> HybGRAG: Hybrid Retrieval-Augmented Generation on Textual and Relational Knowledge Bases </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lee%2C+M">Meng-Chieh Lee</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Q">Qi Zhu</a>, <a href="/search/cs?searchtype=author&query=Mavromatis%2C+C">Costas Mavromatis</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zhen Han</a>, <a href="/search/cs?searchtype=author&query=Adeshina%2C+S">Soji Adeshina</a>, <a href="/search/cs?searchtype=author&query=Ioannidis%2C+V+N">Vassilis N. Ioannidis</a>, <a href="/search/cs?searchtype=author&query=Rangwala%2C+H">Huzefa Rangwala</a>, <a href="/search/cs?searchtype=author&query=Faloutsos%2C+C">Christos Faloutsos</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.16311v1-abstract-short" style="display: inline;"> Given a semi-structured knowledge base (SKB), where text documents are interconnected by relations, how can we effectively retrieve relevant information to answer user questions? Retrieval-Augmented Generation (RAG) retrieves documents to assist large language models (LLMs) in question answering; while Graph RAG (GRAG) uses structured knowledge bases as its knowledge source. However, many question… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.16311v1-abstract-full').style.display = 'inline'; document.getElementById('2412.16311v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.16311v1-abstract-full" style="display: none;"> Given a semi-structured knowledge base (SKB), where text documents are interconnected by relations, how can we effectively retrieve relevant information to answer user questions? Retrieval-Augmented Generation (RAG) retrieves documents to assist large language models (LLMs) in question answering; while Graph RAG (GRAG) uses structured knowledge bases as its knowledge source. However, many questions require both textual and relational information from SKB - referred to as "hybrid" questions - which complicates the retrieval process and underscores the need for a hybrid retrieval method that leverages both information. In this paper, through our empirical analysis, we identify key insights that show why existing methods may struggle with hybrid question answering (HQA) over SKB. Based on these insights, we propose HybGRAG for HQA consisting of a retriever bank and a critic module, with the following advantages: (1) Agentic, it automatically refines the output by incorporating feedback from the critic module, (2) Adaptive, it solves hybrid questions requiring both textual and relational information with the retriever bank, (3) Interpretable, it justifies decision making with intuitive refinement path, and (4) Effective, it surpasses all baselines on HQA benchmarks. In experiments on the STaRK benchmark, HybGRAG achieves significant performance gains, with an average relative improvement in Hit@1 of 51%. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.16311v1-abstract-full').style.display = 'none'; document.getElementById('2412.16311v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.15529">arXiv:2412.15529</a> <span> [<a href="https://arxiv.org/pdf/2412.15529">pdf</a>, <a href="https://arxiv.org/format/2412.15529">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> XRAG: eXamining the Core -- Benchmarking Foundational Components in Advanced Retrieval-Augmented Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mao%2C+Q">Qianren Mao</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+Y">Yangyifei Luo</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jinlong Zhang</a>, <a href="/search/cs?searchtype=author&query=Hao%2C+H">Hanwen Hao</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+Z">Zhilong Cao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiaolong Wang</a>, <a href="/search/cs?searchtype=author&query=Guan%2C+X">Xiao Guan</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Z">Zhenting Huang</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+W">Weifeng Jiang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+S">Shuyu Guo</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zhentao Han</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Q">Qili Zhang</a>, <a href="/search/cs?searchtype=author&query=Tao%2C+S">Siyuan Tao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yujie Liu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Junnan Liu</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+Z">Zhixing Tan</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+J">Jie Sun</a>, <a href="/search/cs?searchtype=author&query=Li%2C+B">Bo Li</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xudong Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Richong Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jianxin Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.15529v2-abstract-short" style="display: inline;"> Retrieval-augmented generation (RAG) synergizes the retrieval of pertinent data with the generative capabilities of Large Language Models (LLMs), ensuring that the generated output is not only contextually relevant but also accurate and current. We introduce XRAG, an open-source, modular codebase that facilitates exhaustive evaluation of the performance of foundational components of advanced RAG m… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.15529v2-abstract-full').style.display = 'inline'; document.getElementById('2412.15529v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.15529v2-abstract-full" style="display: none;"> Retrieval-augmented generation (RAG) synergizes the retrieval of pertinent data with the generative capabilities of Large Language Models (LLMs), ensuring that the generated output is not only contextually relevant but also accurate and current. We introduce XRAG, an open-source, modular codebase that facilitates exhaustive evaluation of the performance of foundational components of advanced RAG modules. These components are systematically categorized into four core phases: pre-retrieval, retrieval, post-retrieval, and generation. We systematically analyse them across reconfigured datasets, providing a comprehensive benchmark for their effectiveness. As the complexity of RAG systems continues to escalate, we underscore the critical need to identify potential failure points in RAG systems. We formulate a suite of experimental methodologies and diagnostic testing protocols to dissect the failure points inherent in RAG engineering. Subsequently, we proffer bespoke solutions aimed at bolstering the overall performance of these modules. Our work thoroughly evaluates the performance of advanced core components in RAG systems, providing insights into optimizations for prevalent failure points. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.15529v2-abstract-full').style.display = 'none'; document.getElementById('2412.15529v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.13912">arXiv:2412.13912</a> <span> [<a href="https://arxiv.org/pdf/2412.13912">pdf</a>, <a href="https://arxiv.org/format/2412.13912">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Energy-Efficient SLAM via Joint Design of Sensing, Communication, and Exploration Speed </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zidong Han</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+R">Ruibo Jin</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiaoyang Li</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+B">Bingpeng Zhou</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Q">Qinyu Zhang</a>, <a href="/search/cs?searchtype=author&query=Gong%2C+Y">Yi Gong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.13912v1-abstract-short" style="display: inline;"> To support future spatial machine intelligence applications, lifelong simultaneous localization and mapping (SLAM) has drawn significant attentions. SLAM is usually realized based on various types of mobile robots performing simultaneous and continuous sensing and communication. This paper focuses on analyzing the energy efficiency of robot operation for lifelong SLAM by jointly considering sensin… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.13912v1-abstract-full').style.display = 'inline'; document.getElementById('2412.13912v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.13912v1-abstract-full" style="display: none;"> To support future spatial machine intelligence applications, lifelong simultaneous localization and mapping (SLAM) has drawn significant attentions. SLAM is usually realized based on various types of mobile robots performing simultaneous and continuous sensing and communication. This paper focuses on analyzing the energy efficiency of robot operation for lifelong SLAM by jointly considering sensing, communication and mechanical factors. The system model is built based on a robot equipped with a 2D light detection and ranging (LiDAR) and an odometry. The cloud point raw data as well as the odometry data are wirelessly transmitted to data center where real-time map reconstruction is realized based on an unsupervised deep learning based method. The sensing duration, transmit power, transmit duration and exploration speed are jointly optimized to minimize the energy consumption. Simulations and experiments demonstrate the performance of our proposed method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.13912v1-abstract-full').style.display = 'none'; document.getElementById('2412.13912v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.12387">arXiv:2412.12387</a> <span> [<a href="https://arxiv.org/pdf/2412.12387">pdf</a>, <a href="https://arxiv.org/format/2412.12387">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Quantum Physics">quant-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> Differential Privacy Preserving Distributed Quantum Computing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhong%2C+H">Hui Zhong</a>, <a href="/search/cs?searchtype=author&query=Ju%2C+K">Keyi Ju</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+J">Jiachen Shen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xinyue Zhang</a>, <a href="/search/cs?searchtype=author&query=Qin%2C+X">Xiaoqi Qin</a>, <a href="/search/cs?searchtype=author&query=Ohtsuki%2C+T">Tomoaki Ohtsuki</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+M">Miao Pan</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zhu Han</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.12387v2-abstract-short" style="display: inline;"> Existing quantum computers can only operate with hundreds of qubits in the Noisy Intermediate-Scale Quantum (NISQ) state, while quantum distributed computing (QDC) is regarded as a reliable way to address this limitation, allowing quantum computers to achieve their full computational potential. However, similar to classical distributed computing, QDC also faces the problem of privacy leakage. Exis… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.12387v2-abstract-full').style.display = 'inline'; document.getElementById('2412.12387v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.12387v2-abstract-full" style="display: none;"> Existing quantum computers can only operate with hundreds of qubits in the Noisy Intermediate-Scale Quantum (NISQ) state, while quantum distributed computing (QDC) is regarded as a reliable way to address this limitation, allowing quantum computers to achieve their full computational potential. However, similar to classical distributed computing, QDC also faces the problem of privacy leakage. Existing research has introduced quantum differential privacy (QDP) for privacy protection in central quantum computing, but there is no dedicated privacy protection mechanisms for QDC. To fill this research gap, our paper introduces a novel concept called quantum R茅nyi differential privacy (QRDP), which incorporates the advantages of classical R茅nyi DP and is applicable in the QDC domain. Based on the new quantum R茅nyi divergence, QRDP provides delicate and flexible privacy protection by introducing parameter $伪$. In particular, the QRDP composition is well suited for QDC, since it allows for more precise control of the total privacy budget in scenarios requiring multiple quantum operations. We analyze a variety of noise mechanisms that can implement QRDP, and derive the lowest privacy budget provided by these mechanisms. Finally, we investigate the impact of different quantum parameters on QRDP. Through our simulations, we also find that adding noise will make the data less usable, but increase the level of privacy protection. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.12387v2-abstract-full').style.display = 'none'; document.getElementById('2412.12387v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.08907">arXiv:2412.08907</a> <span> [<a href="https://arxiv.org/pdf/2412.08907">pdf</a>, <a href="https://arxiv.org/format/2412.08907">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> GaGA: Towards Interactive Global Geolocation Assistant </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Dou%2C+Z">Zhiyang Dou</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zipeng Wang</a>, <a href="/search/cs?searchtype=author&query=Han%2C+X">Xumeng Han</a>, <a href="/search/cs?searchtype=author&query=Qiang%2C+C">Chenhui Qiang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+K">Kuiran Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Guorong Li</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Z">Zhibei Huang</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zhenjun Han</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.08907v1-abstract-short" style="display: inline;"> Global geolocation, which seeks to predict the geographical location of images captured anywhere in the world, is one of the most challenging tasks in the field of computer vision. In this paper, we introduce an innovative interactive global geolocation assistant named GaGA, built upon the flourishing large vision-language models (LVLMs). GaGA uncovers geographical clues within images and combines… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.08907v1-abstract-full').style.display = 'inline'; document.getElementById('2412.08907v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.08907v1-abstract-full" style="display: none;"> Global geolocation, which seeks to predict the geographical location of images captured anywhere in the world, is one of the most challenging tasks in the field of computer vision. In this paper, we introduce an innovative interactive global geolocation assistant named GaGA, built upon the flourishing large vision-language models (LVLMs). GaGA uncovers geographical clues within images and combines them with the extensive world knowledge embedded in LVLMs to determine the geolocations while also providing justifications and explanations for the prediction results. We further designed a novel interactive geolocation method that surpasses traditional static inference approaches. It allows users to intervene, correct, or provide clues for the predictions, making the model more flexible and practical. The development of GaGA relies on the newly proposed Multi-modal Global Geolocation (MG-Geo) dataset, a comprehensive collection of 5 million high-quality image-text pairs. GaGA achieves state-of-the-art performance on the GWS15k dataset, improving accuracy by 4.57% at the country level and 2.92% at the city level, setting a new benchmark. These advancements represent a significant leap forward in developing highly accurate, interactive geolocation systems with global applicability. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.08907v1-abstract-full').style.display = 'none'; document.getElementById('2412.08907v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.08875">arXiv:2412.08875</a> <span> [<a href="https://arxiv.org/pdf/2412.08875">pdf</a>, <a href="https://arxiv.org/format/2412.08875">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Emerging Technologies">cs.ET</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neurons and Cognition">q-bio.NC</span> </div> </div> <p class="title is-5 mathjax"> Brain-inspired AI Agent: The Way Towards AGI </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yu%2C+B">Bo Yu</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+J">Jiangning Wei</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+M">Minzhen Hu</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zejie Han</a>, <a href="/search/cs?searchtype=author&query=Zou%2C+T">Tianjian Zou</a>, <a href="/search/cs?searchtype=author&query=He%2C+Y">Ye He</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jun Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.08875v1-abstract-short" style="display: inline;"> Artificial General Intelligence (AGI), widely regarded as the fundamental goal of artificial intelligence, represents the realization of cognitive capabilities that enable the handling of general tasks with human-like proficiency. Researchers in brain-inspired AI seek inspiration from the operational mechanisms of the human brain, aiming to replicate its functional rules in intelligent models. Mor… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.08875v1-abstract-full').style.display = 'inline'; document.getElementById('2412.08875v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.08875v1-abstract-full" style="display: none;"> Artificial General Intelligence (AGI), widely regarded as the fundamental goal of artificial intelligence, represents the realization of cognitive capabilities that enable the handling of general tasks with human-like proficiency. Researchers in brain-inspired AI seek inspiration from the operational mechanisms of the human brain, aiming to replicate its functional rules in intelligent models. Moreover, with the rapid development of large-scale models in recent years, the concept of agents has garnered increasing attention, with researchers widely recognizing it as a necessary pathway toward achieving AGI. In this article, we propose the concept of a brain-inspired AI agent and analyze how to extract relatively feasible and agent-compatible cortical region functionalities and their associated functional connectivity networks from the complex mechanisms of the human brain. Implementing these structures within an agent enables it to achieve basic cognitive intelligence akin to human capabilities. Finally, we explore the limitations and challenges for realizing brain-inspired agents and discuss their future development. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.08875v1-abstract-full').style.display = 'none'; document.getElementById('2412.08875v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.08149">arXiv:2412.08149</a> <span> [<a href="https://arxiv.org/pdf/2412.08149">pdf</a>, <a href="https://arxiv.org/format/2412.08149">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> AsyncDSB: Schedule-Asynchronous Diffusion Schr枚dinger Bridge for Image Inpainting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zihao Han</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+B">Baoquan Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Lisai Zhang</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+S">Shanshan Feng</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+K">Kenghong Lin</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+G">Guotao Liang</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+Y">Yunming Ye</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+X">Xiaochen Qi</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+G">Guangming Ye</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.08149v1-abstract-short" style="display: inline;"> Image inpainting is an important image generation task, which aims to restore corrupted image from partial visible area. Recently, diffusion Schr枚dinger bridge methods effectively tackle this task by modeling the translation between corrupted and target images as a diffusion Schr枚dinger bridge process along a noising schedule path. Although these methods have shown superior performance, in this pa… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.08149v1-abstract-full').style.display = 'inline'; document.getElementById('2412.08149v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.08149v1-abstract-full" style="display: none;"> Image inpainting is an important image generation task, which aims to restore corrupted image from partial visible area. Recently, diffusion Schr枚dinger bridge methods effectively tackle this task by modeling the translation between corrupted and target images as a diffusion Schr枚dinger bridge process along a noising schedule path. Although these methods have shown superior performance, in this paper, we find that 1) existing methods suffer from a schedule-restoration mismatching issue, i.e., the theoretical schedule and practical restoration processes usually exist a large discrepancy, which theoretically results in the schedule not fully leveraged for restoring images; and 2) the key reason causing such issue is that the restoration process of all pixels are actually asynchronous but existing methods set a synchronous noise schedule to them, i.e., all pixels shares the same noise schedule. To this end, we propose a schedule-Asynchronous Diffusion Schr枚dinger Bridge (AsyncDSB) for image inpainting. Our insight is preferentially scheduling pixels with high frequency (i.e., large gradients) and then low frequency (i.e., small gradients). Based on this insight, given a corrupted image, we first train a network to predict its gradient map in corrupted area. Then, we regard the predicted image gradient as prior and design a simple yet effective pixel-asynchronous noise schedule strategy to enhance the diffusion Schr枚dinger bridge. Thanks to the asynchronous schedule at pixels, the temporal interdependence of restoration process between pixels can be fully characterized for high-quality image inpainting. Experiments on real-world datasets show that our AsyncDSB achieves superior performance, especially on FID with around 3% - 14% improvement over state-of-the-art baseline methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.08149v1-abstract-full').style.display = 'none'; document.getElementById('2412.08149v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by AAAI 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.03897">arXiv:2412.03897</a> <span> [<a href="https://arxiv.org/pdf/2412.03897">pdf</a>, <a href="https://arxiv.org/format/2412.03897">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/TGRS.2024.3478385">10.1109/TGRS.2024.3478385 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Multisource Collaborative Domain Generalization for Cross-Scene Remote Sensing Image Classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zhu Han</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Ce Zhang</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+L">Lianru Gao</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+Z">Zhiqiang Zeng</a>, <a href="/search/cs?searchtype=author&query=Ng%2C+M+K">Michael K. Ng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+B">Bing Zhang</a>, <a href="/search/cs?searchtype=author&query=Chanussot%2C+J">Jocelyn Chanussot</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.03897v1-abstract-short" style="display: inline;"> Cross-scene image classification aims to transfer prior knowledge of ground materials to annotate regions with different distributions and reduce hand-crafted cost in the field of remote sensing. However, existing approaches focus on single-source domain generalization to unseen target domains, and are easily confused by large real-world domain shifts due to the limited training information and in… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.03897v1-abstract-full').style.display = 'inline'; document.getElementById('2412.03897v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.03897v1-abstract-full" style="display: none;"> Cross-scene image classification aims to transfer prior knowledge of ground materials to annotate regions with different distributions and reduce hand-crafted cost in the field of remote sensing. However, existing approaches focus on single-source domain generalization to unseen target domains, and are easily confused by large real-world domain shifts due to the limited training information and insufficient diversity modeling capacity. To address this gap, we propose a novel multi-source collaborative domain generalization framework (MS-CDG) based on homogeneity and heterogeneity characteristics of multi-source remote sensing data, which considers data-aware adversarial augmentation and model-aware multi-level diversification simultaneously to enhance cross-scene generalization performance. The data-aware adversarial augmentation adopts an adversary neural network with semantic guide to generate MS samples by adaptively learning realistic channel and distribution changes across domains. In views of cross-domain and intra-domain modeling, the model-aware diversification transforms the shared spatial-channel features of MS data into the class-wise prototype and kernel mixture module, to address domain discrepancies and cluster different classes effectively. Finally, the joint classification of original and augmented MS samples is employed by introducing a distribution consistency alignment to increase model diversity and ensure better domain-invariant representation learning. Extensive experiments on three public MS remote sensing datasets demonstrate the superior performance of the proposed method when benchmarked with the state-of-the-art methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.03897v1-abstract-full').style.display = 'none'; document.getElementById('2412.03897v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.03893">arXiv:2412.03893</a> <span> [<a href="https://arxiv.org/pdf/2412.03893">pdf</a>, <a href="https://arxiv.org/format/2412.03893">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/TGRS.2024.3418583">10.1109/TGRS.2024.3418583 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Dual-Branch Subpixel-Guided Network for Hyperspectral Image Classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zhu Han</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jin Yang</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+L">Lianru Gao</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+Z">Zhiqiang Zeng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+B">Bing Zhang</a>, <a href="/search/cs?searchtype=author&query=Chanussot%2C+J">Jocelyn Chanussot</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.03893v1-abstract-short" style="display: inline;"> Deep learning (DL) has been widely applied into hyperspectral image (HSI) classification owing to its promising feature learning and representation capabilities. However, limited by the spatial resolution of sensors, existing DL-based classification approaches mainly focus on pixel-level spectral and spatial information extraction through complex network architecture design, while ignoring the exi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.03893v1-abstract-full').style.display = 'inline'; document.getElementById('2412.03893v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.03893v1-abstract-full" style="display: none;"> Deep learning (DL) has been widely applied into hyperspectral image (HSI) classification owing to its promising feature learning and representation capabilities. However, limited by the spatial resolution of sensors, existing DL-based classification approaches mainly focus on pixel-level spectral and spatial information extraction through complex network architecture design, while ignoring the existence of mixed pixels in actual scenarios. To tackle this difficulty, we propose a novel dual-branch subpixel-guided network for HSI classification, called DSNet, which automatically integrates subpixel information and convolutional class features by introducing a deep autoencoder unmixing architecture to enhance classification performance. DSNet is capable of fully considering physically nonlinear properties within subpixels and adaptively generating diagnostic abundances in an unsupervised manner to achieve more reliable decision boundaries for class label distributions. The subpixel fusion module is designed to ensure high-quality information fusion across pixel and subpixel features, further promoting stable joint classification. Experimental results on three benchmark datasets demonstrate the effectiveness and superiority of DSNet compared with state-of-the-art DL-based HSI classification approaches. The codes will be available at https://github.com/hanzhu97702/DSNet, contributing to the remote sensing community. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.03893v1-abstract-full').style.display = 'none'; document.getElementById('2412.03893v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.19334">arXiv:2411.19334</a> <span> [<a href="https://arxiv.org/pdf/2411.19334">pdf</a>, <a href="https://arxiv.org/format/2411.19334">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Reconfigurable Holographic Surface: A New Paradigm for Ultra-Massive MIMO </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Di%2C+B">Boya Di</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Hongliang Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Rui Zhang</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zhu Han</a>, <a href="/search/cs?searchtype=author&query=Song%2C+L">Lingyang Song</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.19334v1-abstract-short" style="display: inline;"> Evolving from massive multiple-input multiple-output (MIMO) in current 5G communications, ultra-massive MIMO emerges as a seminal technology for fulfilling more stringent requirements of future 6G communications. However, widely-utilized phased arrays relying on active components make the implementation of ultra-massive MIMO in practice increasingly prohibitive from both cost and power consumption… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.19334v1-abstract-full').style.display = 'inline'; document.getElementById('2411.19334v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.19334v1-abstract-full" style="display: none;"> Evolving from massive multiple-input multiple-output (MIMO) in current 5G communications, ultra-massive MIMO emerges as a seminal technology for fulfilling more stringent requirements of future 6G communications. However, widely-utilized phased arrays relying on active components make the implementation of ultra-massive MIMO in practice increasingly prohibitive from both cost and power consumption perspectives. In contrast, the development of reconfigurable holographic surface (RHS) provides a new paradigm to solve the above issue without the need of costly hardware components. By leveraging the holographic principle, the RHS serves as an ultra-thin and lightweight surface antenna integrated with the transceiver, which is a promising alternative to phased arrays for realizing ultra-massive MIMO. In this paper, we provide a comprehensive overview of the RHS, especially the RHS-aided communication and sensing. We first describe the basic concepts of RHS, and introduce its working principle and unique practical constraints. Moreover, we show how to utilize the RHS to achieve cost-efficient and high-performance wireless communication and sensing, and introduce the key technologies. In particular, we present the implementation of RHS with a wireless communication prototype, and report the experimental measurement results based on it. Finally, we outline some open challenges and potential future directions in this area. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.19334v1-abstract-full').style.display = 'none'; document.getElementById('2411.19334v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.16575">arXiv:2411.16575</a> <span> [<a href="https://arxiv.org/pdf/2411.16575">pdf</a>, <a href="https://arxiv.org/format/2411.16575">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Rethinking Diffusion for Text-Driven Human Motion Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Meng%2C+Z">Zichong Meng</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+Y">Yiming Xie</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+X">Xiaogang Peng</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zeyu Han</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+H">Huaizu Jiang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.16575v1-abstract-short" style="display: inline;"> Since 2023, Vector Quantization (VQ)-based discrete generation methods have rapidly dominated human motion generation, primarily surpassing diffusion-based continuous generation methods in standard performance metrics. However, VQ-based methods have inherent limitations. Representing continuous motion data as limited discrete tokens leads to inevitable information loss, reduces the diversity of ge… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16575v1-abstract-full').style.display = 'inline'; document.getElementById('2411.16575v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.16575v1-abstract-full" style="display: none;"> Since 2023, Vector Quantization (VQ)-based discrete generation methods have rapidly dominated human motion generation, primarily surpassing diffusion-based continuous generation methods in standard performance metrics. However, VQ-based methods have inherent limitations. Representing continuous motion data as limited discrete tokens leads to inevitable information loss, reduces the diversity of generated motions, and restricts their ability to function effectively as motion priors or generation guidance. In contrast, the continuous space generation nature of diffusion-based methods makes them well-suited to address these limitations and with even potential for model scalability. In this work, we systematically investigate why current VQ-based methods perform well and explore the limitations of existing diffusion-based methods from the perspective of motion data representation and distribution. Drawing on these insights, we preserve the inherent strengths of a diffusion-based human motion generation model and gradually optimize it with inspiration from VQ-based approaches. Our approach introduces a human motion diffusion model enabled to perform bidirectional masked autoregression, optimized with a reformed data representation and distribution. Additionally, we also propose more robust evaluation methods to fairly assess different-based methods. Extensive experiments on benchmark human motion generation datasets demonstrate that our method excels previous methods and achieves state-of-the-art performances. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16575v1-abstract-full').style.display = 'none'; document.getElementById('2411.16575v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Preprint</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.15844">arXiv:2411.15844</a> <span> [<a href="https://arxiv.org/pdf/2411.15844">pdf</a>, <a href="https://arxiv.org/ps/2411.15844">ps</a>, <a href="https://arxiv.org/format/2411.15844">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Unveiling the Superior Paradigm: A Comparative Study of Source-Free Domain Adaptation and Unsupervised Domain Adaptation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+F">Fan Wang</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zhongyi Han</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xingbo Liu</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+X">Xin Gao</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+Y">Yilong Yin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.15844v1-abstract-short" style="display: inline;"> In domain adaptation, there are two popular paradigms: Unsupervised Domain Adaptation (UDA), which aligns distributions using source data, and Source-Free Domain Adaptation (SFDA), which leverages pre-trained source models without accessing source data. Evaluating the superiority of UDA versus SFDA is an open and timely question with significant implications for deploying adaptive algorithms in pr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15844v1-abstract-full').style.display = 'inline'; document.getElementById('2411.15844v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.15844v1-abstract-full" style="display: none;"> In domain adaptation, there are two popular paradigms: Unsupervised Domain Adaptation (UDA), which aligns distributions using source data, and Source-Free Domain Adaptation (SFDA), which leverages pre-trained source models without accessing source data. Evaluating the superiority of UDA versus SFDA is an open and timely question with significant implications for deploying adaptive algorithms in practical applications. In this study, we demonstrate through predictive coding theory and extensive experiments on multiple benchmark datasets that SFDA generally outperforms UDA in real-world scenarios. Specifically, SFDA offers advantages in time efficiency, storage requirements, targeted learning objectives, reduced risk of negative transfer, and increased robustness against overfitting. Notably, SFDA is particularly effective in mitigating negative transfer when there are substantial distribution discrepancies between source and target domains. Additionally, we introduce a novel data-model fusion scenario, where data sharing among stakeholders varies (e.g., some provide raw data while others provide only models), and reveal that traditional UDA and SFDA methods do not fully exploit their potential in this context. To address this limitation and capitalize on the strengths of SFDA, we propose a novel weight estimation method that effectively integrates available source data into multi-SFDA (MSFDA) approaches, thereby enhancing model performance within this scenario. This work provides a thorough analysis of UDA versus SFDA and advances a practical approach to model adaptation across diverse real-world environments. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15844v1-abstract-full').style.display = 'none'; document.getElementById('2411.15844v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Under review</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Han%2C+Z&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Han%2C+Z&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Han%2C+Z&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Han%2C+Z&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Han%2C+Z&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Han%2C+Z&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <div class="is-hidden-tablet">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary">  <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div>   <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div>  </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

CINXE.COM

Search | arXiv e-print repository