Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 673 results for author: <span class="mathjax">Han, Z</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Han%2C+Z">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Han, Z"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Han%2C+Z&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Han, Z"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Han%2C+Z&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Han%2C+Z&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Han%2C+Z&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Han%2C+Z&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Han%2C+Z&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Han%2C+Z&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14049">arXiv:2411.14049</a> <span> [<a href="https://arxiv.org/pdf/2411.14049">pdf</a>, <a href="https://arxiv.org/format/2411.14049">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Out-Of-Distribution Detection with Diversification (Provably) </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yao%2C+H">Haiyun Yao</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zongbo Han</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+H">Huazhu Fu</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+X">Xi Peng</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Q">Qinghua Hu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Changqing Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14049v1-abstract-short" style="display: inline;"> Out-of-distribution (OOD) detection is crucial for ensuring reliable deployment of machine learning models. Recent advancements focus on utilizing easily accessible auxiliary outliers (e.g., data from the web or other datasets) in training. 
   Abstract: Out-of-distribution (OOD) detection is crucial for ensuring reliable deployment of machine learning models. Recent advancements focus on utilizing easily accessible auxiliary outliers (e.g., data from the web or other datasets) in training. However, we experimentally reveal that these methods still struggle to generalize their detection capabilities to unknown OOD data, due to the limited diversity of the auxiliary outliers collected. Therefore, we thoroughly examine this problem from the generalization perspective and demonstrate that a more diverse set of auxiliary outliers is essential for enhancing the detection capabilities. However, in practice, it is difficult and costly to collect sufficiently diverse auxiliary outlier data. Therefore, we propose a simple yet practical approach with a theoretical guarantee, termed Diversity-induced Mixup for OOD detection (diverseMix), which enhances the diversity of the auxiliary outlier set for training in an efficient way. Extensive experiments show that diverseMix achieves superior performance on commonly used and recent challenging large-scale benchmarks, which further confirms the importance of the diversity of auxiliary outliers.
   Submitted 21 November, 2024; originally announced November 2024.
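   A minimal sketch of the core idea described in this abstract, mixing auxiliary outliers to diversify the outlier set used for outlier-exposure-style training. The function names, the Beta prior, and the loss are assumptions for illustration, not the authors' released implementation.

```python
# Hedged sketch: mixup between auxiliary outliers, assumed names and loss.
import torch
import torch.nn.functional as F

def mix_outliers(x_aux: torch.Tensor, alpha: float = 1.0) -> torch.Tensor:
    """Convexly combine random pairs of auxiliary outliers (mixup-style)."""
    lam = torch.distributions.Beta(alpha, alpha).sample((x_aux.size(0), 1, 1, 1))
    perm = torch.randperm(x_aux.size(0))
    return lam * x_aux + (1.0 - lam) * x_aux[perm]

def ood_training_step(model, x_in, y_in, x_aux):
    """One step: cross-entropy on in-distribution data plus a uniform-prediction
    penalty on mixed outliers (a common outlier-exposure surrogate)."""
    loss_in = F.cross_entropy(model(x_in), y_in)
    logits_out = model(mix_outliers(x_aux))
    # Push predictions on mixed outliers toward the uniform distribution.
    loss_out = -F.log_softmax(logits_out, dim=1).mean()
    return loss_in + 0.5 * loss_out
```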
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13183">arXiv:2411.13183</a> <span> [<a href="https://arxiv.org/pdf/2411.13183">pdf</a>, <a href="https://arxiv.org/format/2411.13183">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Click; Single Object Tracking; Video Object Segmentation; Real-time Interaction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+K">Kuiran Wang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+X">Xuehui Yu</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+W">Wenwen Yu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Guorong Li</a>, <a href="/search/cs?searchtype=author&query=Lan%2C+X">Xiangyuan Lan</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+Q">Qixiang Ye</a>, <a href="/search/cs?searchtype=author&query=Jiao%2C+J">Jianbin Jiao</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zhenjun Han</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13183v1-abstract-short" style="display: inline;"> Single object tracking(SOT) relies on precise object bounding box initialization. In this paper, we reconsidered the deficiencies in the current approaches to initializing single object trackers and propose a new paradigm for single object tracking algorithms, ClickTrack, a new paradigm using clicking interaction for real-time scenarios. Moreover, click as an input type inherently lack hierarchica… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13183v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13183v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13183v1-abstract-full" style="display: none;"> Single object tracking(SOT) relies on precise object bounding box initialization. In this paper, we reconsidered the deficiencies in the current approaches to initializing single object trackers and propose a new paradigm for single object tracking algorithms, ClickTrack, a new paradigm using clicking interaction for real-time scenarios. Moreover, click as an input type inherently lack hierarchical information. To address ambiguity in certain special scenarios, we designed the Guided Click Refiner(GCR), which accepts point and optional textual information as inputs, transforming the point into the bounding box expected by the operator. The bounding box will be used as input of single object trackers. Experiments on LaSOT and GOT-10k benchmarks show that tracker combined with GCR achieves stable performance in real-time interactive scenarios. Furthermore, we explored the integration of GCR into the Segment Anything model(SAM), significantly reducing ambiguity issues when SAM receives point inputs. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13183v1-abstract-full').style.display = 'none'; document.getElementById('2411.13183v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.08212">arXiv:2411.08212</a> <span> [<a href="https://arxiv.org/pdf/2411.08212">pdf</a>, <a href="https://arxiv.org/format/2411.08212">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> PERFT: Parameter-Efficient Routed Fine-Tuning for Mixture-of-Expert Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yilun Liu</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+Y">Yunpu Ma</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+S">Shuo Chen</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+Z">Zifeng Ding</a>, <a href="/search/cs?searchtype=author&query=He%2C+B">Bailan He</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zhen Han</a>, <a href="/search/cs?searchtype=author&query=Tresp%2C+V">Volker Tresp</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.08212v1-abstract-short" style="display: inline;"> The Mixture-of-Experts (MoE) paradigm has emerged as a powerful approach for scaling transformers with improved resource utilization. However, efficiently fine-tuning MoE models remains largely underexplored. Inspired by recent works on Parameter-Efficient Fine-Tuning (PEFT), we present a unified framework for integrating PEFT modules directly into the MoE mechanism. Aligning with the core princip… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08212v1-abstract-full').style.display = 'inline'; document.getElementById('2411.08212v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.08212v1-abstract-full" style="display: none;"> The Mixture-of-Experts (MoE) paradigm has emerged as a powerful approach for scaling transformers with improved resource utilization. However, efficiently fine-tuning MoE models remains largely underexplored. Inspired by recent works on Parameter-Efficient Fine-Tuning (PEFT), we present a unified framework for integrating PEFT modules directly into the MoE mechanism. Aligning with the core principles and architecture of MoE, our framework encompasses a set of design dimensions including various functional and composition strategies. By combining design choices within our framework, we introduce Parameter-Efficient Routed Fine-Tuning (PERFT) as a flexible and scalable family of PEFT strategies tailored for MoE models. 
   Abstract: The Mixture-of-Experts (MoE) paradigm has emerged as a powerful approach for scaling transformers with improved resource utilization. However, efficiently fine-tuning MoE models remains largely underexplored. Inspired by recent works on Parameter-Efficient Fine-Tuning (PEFT), we present a unified framework for integrating PEFT modules directly into the MoE mechanism. Aligning with the core principles and architecture of MoE, our framework encompasses a set of design dimensions including various functional and composition strategies. By combining design choices within our framework, we introduce Parameter-Efficient Routed Fine-Tuning (PERFT) as a flexible and scalable family of PEFT strategies tailored for MoE models. Extensive experiments on adapting OLMoE-1B-7B and Mixtral-8×7B for commonsense and arithmetic reasoning tasks demonstrate the effectiveness, scalability, and intriguing dynamics of PERFT. Additionally, we provide empirical findings for each specific design choice to facilitate better application of MoE and PEFT.
   Submitted 12 November, 2024; originally announced November 2024.
   Comments: Code available via https://anonymous.4open.science/r/PERFT-MoE/
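   For readers unfamiliar with the general idea of placing PEFT modules inside an MoE layer, here is a hedged PyTorch sketch: low-rank adapters routed alongside frozen experts. Module names, top-k routing, and dimensions are illustrative assumptions, not the PERFT design or code.

```python
# Hedged sketch: frozen MoE experts plus trainable per-expert low-rank adapters
# that share the (frozen) router. Not the paper's implementation.
import torch
import torch.nn as nn

class RoutedLowRankAdapter(nn.Module):
    def __init__(self, d_model: int, rank: int = 8):
        super().__init__()
        self.down = nn.Linear(d_model, rank, bias=False)
        self.up = nn.Linear(rank, d_model, bias=False)
        nn.init.zeros_(self.up.weight)  # start with a zero delta

    def forward(self, x):
        return self.up(self.down(x))

class MoEWithRoutedAdapters(nn.Module):
    """Frozen experts and router; only the adapters are trained."""
    def __init__(self, experts: nn.ModuleList, router: nn.Linear, top_k: int = 2):
        super().__init__()
        self.experts, self.router, self.top_k = experts, router, top_k
        for p in list(self.experts.parameters()) + list(self.router.parameters()):
            p.requires_grad_(False)
        self.adapters = nn.ModuleList(
            RoutedLowRankAdapter(router.in_features) for _ in experts
        )

    def forward(self, x):                                   # x: (tokens, d_model)
        gates = self.router(x).softmax(dim=-1)              # (tokens, n_experts)
        topv, topi = gates.topk(self.top_k, dim=-1)
        out = torch.zeros_like(x)
        for slot in range(self.top_k):
            idx, w = topi[:, slot], topv[:, slot:slot + 1]
            for e in idx.unique():
                e_i = int(e)
                m = idx == e_i
                y = self.experts[e_i](x[m]) + self.adapters[e_i](x[m])
                out[m] += w[m] * y
        return out
```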
4. arXiv:2411.06881 [pdf, other]  cs.LG (Machine Learning); stat.ML (Machine Learning)
   WassFFed: Wasserstein Fair Federated Learning
   Authors: Zhongxuan Han, Li Zhang, Chaochao Chen, Xiaolin Zheng, Fei Zheng, Yuyuan Li, Jianwei Yin
   Abstract: Federated Learning (FL) employs a training approach to address scenarios where users' data cannot be shared across clients. Achieving fairness in FL is imperative, since training data in FL is inherently geographically distributed among diverse user groups. Existing research on fairness predominantly assumes access to the entire training data, making direct transfer to FL challenging. However, the limited existing research on fairness in FL does not effectively address two key challenges: (CH1) current methods fail to deal with the inconsistency between fair optimization results obtained with surrogate functions and fair classification results, and (CH2) directly aggregating local fair models does not always yield a globally fair model due to non-identically and independently distributed (non-IID) data among clients. To address these challenges, we propose a Wasserstein Fair Federated Learning framework, namely WassFFed. To tackle CH1, we ensure that the outputs of local models, rather than the loss calculated with surrogate functions or classification results with a threshold, remain independent of the various user groups. To resolve CH2, we employ a Wasserstein barycenter calculation over all local models' outputs for each user group, bringing local model outputs closer to the global output distribution to ensure consistency between the global model and local models. We conduct extensive experiments on three real-world datasets, demonstrating that WassFFed outperforms existing approaches in striking a balance between accuracy and fairness.
   Submitted 11 November, 2024; originally announced November 2024.
   Comments: Submitted to TKDE
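   A hedged sketch of the aggregation idea only: for one-dimensional model outputs (scores), a Wasserstein-2 barycenter of empirical distributions can be computed by averaging quantile functions. This illustrates the concept named in the abstract, not the WassFFed algorithm or its code; all names and data are made up.

```python
# Hedged sketch: 1-D Wasserstein-2 barycenter via quantile averaging.
import numpy as np

def quantile_barycenter(score_sets, n_grid: int = 100) -> np.ndarray:
    """Barycenter of empirical 1-D score distributions, returned as its
    quantile function evaluated on a uniform grid of probability levels."""
    qs = np.linspace(0.0, 1.0, n_grid)
    quantiles = [np.quantile(np.asarray(s), qs) for s in score_sets]
    return np.mean(quantiles, axis=0)

# Toy usage: scores two clients assign to the same protected group.
rng = np.random.default_rng(0)
client_a = rng.beta(2, 5, size=500)
client_b = rng.beta(5, 2, size=500)
bary_q = quantile_barycenter([client_a, client_b])
# Each client could then be nudged so its group-wise score distribution moves
# toward this common barycenter, giving a consistent target across clients.
```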
5. arXiv:2411.01208 [pdf, other]  cs.CV (Computer Vision and Pattern Recognition)
   MultiPull: Detailing Signed Distance Functions by Pulling Multi-Level Queries at Multi-Step
   Authors: Takeshi Noda, Chao Chen, Weiqi Zhang, Xinhai Liu, Yu-Shen Liu, Zhizhong Han
   Abstract: Reconstructing a continuous surface from a raw 3D point cloud is a challenging task. Recent methods usually train neural networks to overfit on single point clouds to infer signed distance functions (SDFs). However, neural networks tend to smooth local details due to the lack of ground truth signed distances or normals, which limits the performance of overfitting-based methods in reconstruction tasks. To resolve this issue, we propose a novel method, named MultiPull, to learn multi-scale implicit fields from raw point clouds by optimizing accurate SDFs from coarse to fine. We achieve this by mapping 3D query points into a set of frequency features, which makes it possible to leverage multi-level features during optimization. Meanwhile, we introduce optimization constraints from the perspective of spatial distance and normal consistency, which play a key role in point cloud reconstruction based on multi-scale optimization strategies. Our experiments on widely used object and scene benchmarks demonstrate that our method outperforms the state-of-the-art methods in surface reconstruction.
   Submitted 2 November, 2024; originally announced November 2024.
   Comments: Accepted by NeurIPS 2024. Project page: https://takeshie.github.io/MultiPull/
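   The "pulling" operation this line of SDF work builds on can be written in a few lines. Below is a hedged PyTorch sketch of the generic operation; `sdf_net` is a placeholder MLP, not the MultiPull architecture, and the training loss described in the comment is an assumption.

```python
# Hedged sketch: move a query point along the negative SDF gradient by the
# predicted signed distance, which should land it on the zero-level set.
import torch

def pull_to_surface(sdf_net, queries: torch.Tensor) -> torch.Tensor:
    """queries: (N, 3) query points; returns their pulled positions."""
    queries = queries.detach().clone().requires_grad_(True)
    d = sdf_net(queries)                                    # (N, 1) signed distance
    (grad,) = torch.autograd.grad(d.sum(), queries, create_graph=True)
    n = grad / (grad.norm(dim=-1, keepdim=True) + 1e-8)     # unit gradient direction
    return queries - d * n                                  # ideally on the surface

# Training would then minimize the distance between pulled queries and their
# nearest points in the raw point cloud, optionally at several frequency levels
# and optimization steps (the "multi-level, multi-step" part of the title).
```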
6. arXiv:2410.21882 [pdf, other]  cs.AI (Artificial Intelligence)
   Building Altruistic and Moral AI Agent with Brain-inspired Affective Empathy Mechanisms
   Authors: Feifei Zhao, Hui Feng, Haibo Tong, Zhengqiang Han, Enmeng Lu, Yinqian Sun, Yi Zeng
   Abstract: As AI closely interacts with human society, it is crucial to ensure that its decision-making is safe, altruistic, and aligned with human ethical and moral values. However, existing research on embedding ethical and moral considerations into AI remains insufficient, and previous external constraints based on principles and rules are inadequate to provide AI with long-term stability and generalization capabilities. In contrast, intrinsic altruistic motivation based on empathy is more willing, spontaneous, and robust. Therefore, this paper is dedicated to autonomously driving intelligent agents to acquire moral behaviors through human-like affective empathy mechanisms. We draw inspiration from the neural mechanism of the human brain's moral intuitive decision-making, and simulate the mirror neuron system to construct a brain-inspired, affective-empathy-driven altruistic decision-making model. Here, empathy directly impacts dopamine release to form intrinsic altruistic motivation. Based on the principle of moral utilitarianism, we design a moral reward function that integrates intrinsic empathy and extrinsic self-task goals. A comprehensive experimental scenario incorporating empathetic processes, personal objectives, and altruistic goals is developed. The proposed model enables the agent to make consistent moral decisions (prioritizing altruism) by balancing self-interest with the well-being of others. We further introduce inhibitory neurons to regulate different levels of empathy and verify the positive correlation between empathy levels and altruistic preferences, yielding conclusions consistent with findings from psychological behavioral experiments. This work provides a feasible solution for the development of ethical AI by leveraging intrinsic human-like empathy mechanisms, and contributes to the harmonious coexistence between humans and AI.
   Submitted 29 October, 2024; originally announced October 2024.

7. arXiv:2410.20344 [pdf, other]  eess.SP (Signal Processing); cs.IT (Information Theory)
   Deep Learning-Assisted Jamming Mitigation with Movable Antenna Array
   Authors: Xiao Tang, Yudan Jiang, Jinxin Liu, Qinghe Du, Dusit Niyato, Zhu Han
   Abstract: This paper reveals the potential of movable antennas in enhancing anti-jamming communication. We consider a legitimate communication link in the presence of multiple jammers and propose deploying a movable antenna array at the receiver to combat jamming attacks. We formulate the problem as signal-to-interference-plus-noise ratio (SINR) maximization by jointly optimizing the receive beamforming and the antenna element positioning. Due to the non-convexity and multi-fold difficulties from an optimization perspective, we develop a deep learning-based framework in which beamforming is tackled as a Rayleigh quotient problem, while antenna positioning is addressed through multi-layer perceptron training. The neural network parameters are optimized using stochastic gradient descent to achieve an effective jamming mitigation strategy, featuring offline training with marginal complexity for online inference. Numerical results demonstrate that the proposed approach achieves near-optimal anti-jamming performance, thereby significantly improving the efficiency of strategy determination.
   Submitted 27 October, 2024; originally announced October 2024.
   Comments: 5 pages, under consideration
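   A hedged illustration of the Rayleigh-quotient view mentioned in this abstract: for a fixed array geometry, the SINR-optimal receive beamformer has the standard closed form w ∝ R⁻¹h, with R the interference-plus-noise covariance and h the desired channel. The shapes and values below are made up; this is not the paper's framework.

```python
# Hedged sketch: max-SINR receive beamforming as a Rayleigh quotient problem.
import numpy as np

def max_sinr_beamformer(h: np.ndarray, R: np.ndarray) -> np.ndarray:
    """h: (N,) desired channel; R: (N, N) interference-plus-noise covariance."""
    w = np.linalg.solve(R, h)      # w ∝ R^{-1} h maximizes |w^H h|^2 / (w^H R w)
    return w / np.linalg.norm(w)

def sinr(w: np.ndarray, h: np.ndarray, R: np.ndarray) -> float:
    return float(np.abs(w.conj() @ h) ** 2 / np.real(w.conj() @ R @ w))

# Toy example: 4-element array, one desired channel, one strong jammer.
rng = np.random.default_rng(0)
h = rng.standard_normal(4) + 1j * rng.standard_normal(4)
g = rng.standard_normal(4) + 1j * rng.standard_normal(4)      # jammer channel
R = 10.0 * np.outer(g, g.conj()) + np.eye(4)                   # jammer + noise
w = max_sinr_beamformer(h, R)
print(round(sinr(w, h, R), 2))
# In the paper's setting, an MLP would additionally propose the antenna element
# positions (which change h, g, and hence R) before this closed-form step.
```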
8. arXiv:2410.19680 [pdf, other]  cs.CV (Computer Vision and Pattern Recognition)
   Inferring Neural Signed Distance Functions by Overfitting on Single Noisy Point Clouds through Finetuning Data-Driven based Priors
   Authors: Chao Chen, Yu-Shen Liu, Zhizhong Han
   Abstract: It is important to estimate an accurate signed distance function (SDF) from a point cloud in many computer vision applications. The latest methods learn neural SDFs using either a data-driven strategy or an overfitting-based strategy. However, these two kinds of methods suffer from either poor generalization or slow convergence, which limits their capability under challenging scenarios like highly noisy point clouds. To resolve this issue, we propose a method that combines the strengths of both data-driven and overfitting-based methods for better generalization, faster inference, and higher accuracy in learning neural SDFs. We introduce a novel statistical reasoning algorithm in local regions which is able to finetune data-driven priors without signed distance supervision, clean point clouds, or point normals. This helps our method start with a good initialization and converge to a minimum in a much faster way. Our numerical and visual comparisons with the state-of-the-art methods show our superiority over these methods in surface reconstruction and point cloud denoising on widely used shape and scene benchmarks. The code is available at https://github.com/chenchao15/LocalN2NM.
   Submitted 25 October, 2024; originally announced October 2024.
   Comments: Accepted by NeurIPS 2024. Project page: https://chenchao15.github.io/LocalN2NM/

9. arXiv:2410.19665 [pdf, other]  cs.LG (Machine Learning); cs.CR (Cryptography and Security); cs.GT (Computer Science and Game Theory)
   MetaTrading: An Immersion-Aware Model Trading Framework for Vehicular Metaverse Services
   Authors: Hongjia Wu, Hui Zeng, Zehui Xiong, Jiawen Kang, Zhiping Cai, Tse-Tin Chan, Dusit Niyato, Zhu Han
   Abstract: Updates of extensive Internet of Things (IoT) data are critical to the immersion of vehicular metaverse services. However, providing high-quality and sustainable data in unstable and resource-constrained vehicular networks remains a significant challenge. To address this problem, we put forth a novel immersion-aware model trading framework that incentivizes metaverse users (MUs) to contribute learning models trained on their latest local data for augmented reality (AR) services in the vehicular metaverse, while preserving their privacy through federated learning. To comprehensively evaluate the contribution of locally trained learning models provided by MUs to AR services, we design a new immersion metric that captures service immersion by considering the freshness and accuracy of learning models, as well as the amount and potential value of the raw data used for training. We model the trading interactions between metaverse service providers (MSPs) and MUs as an equilibrium problem with equilibrium constraints (EPEC) to analyze and balance their costs and gains. Moreover, considering dynamic network conditions and privacy concerns, we formulate the reward decisions of MSPs as a multi-agent Markov decision process. Then, a fully distributed dynamic reward method based on deep reinforcement learning is presented, which operates without any private information about MUs and other MSPs. Experimental results demonstrate that the proposed framework can effectively provide higher-value models for object detection and classification in AR services on real AR-related vehicle datasets compared to benchmark schemes.
   Submitted 25 October, 2024; originally announced October 2024.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.18822">arXiv:2410.18822</a> <span> [<a href="https://arxiv.org/pdf/2410.18822">pdf</a>, <a href="https://arxiv.org/format/2410.18822">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Binocular-Guided 3D Gaussian Splatting with View Consistency for Sparse View Synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Han%2C+L">Liang Han</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Junsheng Zhou</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yu-Shen Liu</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zhizhong Han</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.18822v2-abstract-short" style="display: inline;"> Novel view synthesis from sparse inputs is a vital yet challenging task in 3D computer vision. Previous methods explore 3D Gaussian Splatting with neural priors (e.g. depth priors) as an additional supervision, demonstrating promising quality and efficiency compared to the NeRF based methods. However, the neural priors from 2D pretrained models are often noisy and blurry, which struggle to precise… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18822v2-abstract-full').style.display = 'inline'; document.getElementById('2410.18822v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.18822v2-abstract-full" style="display: none;"> Novel view synthesis from sparse inputs is a vital yet challenging task in 3D computer vision. Previous methods explore 3D Gaussian Splatting with neural priors (e.g. depth priors) as an additional supervision, demonstrating promising quality and efficiency compared to the NeRF based methods. However, the neural priors from 2D pretrained models are often noisy and blurry, which struggle to precisely guide the learning of radiance fields. In this paper, We propose a novel method for synthesizing novel views from sparse views with Gaussian Splatting that does not require external prior as supervision. Our key idea lies in exploring the self-supervisions inherent in the binocular stereo consistency between each pair of binocular images constructed with disparity-guided image warping. To this end, we additionally introduce a Gaussian opacity constraint which regularizes the Gaussian locations and avoids Gaussian redundancy for improving the robustness and efficiency of inferring 3D Gaussians from sparse views. Extensive experiments on the LLFF, DTU, and Blender datasets demonstrate that our method significantly outperforms the state-of-the-art methods. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18822v2-abstract-full').style.display = 'none'; document.getElementById('2410.18822v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by NeurIPS 2024. Project page: https://hanl2010.github.io/Binocular3DGS/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.15971">arXiv:2410.15971</a> <span> [<a href="https://arxiv.org/pdf/2410.15971">pdf</a>, <a href="https://arxiv.org/format/2410.15971">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Zero-Shot Scene Reconstruction from Single Images with Deep Prior Assembly </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Junsheng Zhou</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yu-Shen Liu</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zhizhong Han</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.15971v1-abstract-short" style="display: inline;"> Large language and vision models have been leading a revolution in visual computing. By greatly scaling up sizes of data and model parameters, the large models learn deep priors which lead to remarkable performance in various tasks. In this work, we present deep prior assembly, a novel framework that assembles diverse deep priors from large models for scene reconstruction from single images in a z… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15971v1-abstract-full').style.display = 'inline'; document.getElementById('2410.15971v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.15971v1-abstract-full" style="display: none;"> Large language and vision models have been leading a revolution in visual computing. By greatly scaling up sizes of data and model parameters, the large models learn deep priors which lead to remarkable performance in various tasks. In this work, we present deep prior assembly, a novel framework that assembles diverse deep priors from large models for scene reconstruction from single images in a zero-shot manner. We show that this challenging task can be done without extra knowledge but just simply generalizing one deep prior in one sub-task. To this end, we introduce novel methods related to poses, scales, and occlusion parsing which are keys to enable deep priors to work together in a robust way. Deep prior assembly does not require any 3D or 2D data-driven training in the task and demonstrates superior performance in generalizing priors to open-world scenes. 
11. arXiv:2410.15971 [pdf, other]  cs.CV (Computer Vision and Pattern Recognition)
    Zero-Shot Scene Reconstruction from Single Images with Deep Prior Assembly
    Authors: Junsheng Zhou, Yu-Shen Liu, Zhizhong Han
    Abstract: Large language and vision models have been leading a revolution in visual computing. By greatly scaling up the sizes of data and model parameters, these large models learn deep priors which lead to remarkable performance in various tasks. In this work, we present deep prior assembly, a novel framework that assembles diverse deep priors from large models for scene reconstruction from single images in a zero-shot manner. We show that this challenging task can be done without extra knowledge, simply by generalizing one deep prior in each sub-task. To this end, we introduce novel methods for pose, scale, and occlusion parsing, which are key to enabling the deep priors to work together in a robust way. Deep prior assembly does not require any 3D or 2D data-driven training for the task and demonstrates superior performance in generalizing priors to open-world scenes. We conduct evaluations on various datasets, and report analysis, numerical and visual comparisons with the latest methods to show our superiority. Project page: https://junshengzhou.github.io/DeepPriorAssembly.
    Submitted 21 October, 2024; originally announced October 2024.
    Comments: To appear at NeurIPS 2024. Project page: https://junshengzhou.github.io/DeepPriorAssembly

12. arXiv:2410.15732 [pdf, other]  cs.CV (Computer Vision and Pattern Recognition)
    ViMoE: An Empirical Study of Designing Vision Mixture-of-Experts
    Authors: Xumeng Han, Longhui Wei, Zhiyang Dou, Zipeng Wang, Chenhui Qiang, Xin He, Yingfei Sun, Zhenjun Han, Qi Tian
In this paper, we integrate the MoE structure into the classic Vision Transformer (ViT), naming it ViMoE, and explore the potential of applying MoE to vision through a comprehensive study on image classification. However, we observe that the performance is sensitive to the configuration of MoE layers, making it challenging to obtain optimal results without careful design. The underlying cause is that inappropriate MoE layers lead to unreliable routing and hinder experts from effectively acquiring helpful knowledge. To address this, we introduce a shared expert to learn and capture common information, serving as an effective way to construct stable ViMoE. Furthermore, we demonstrate how to analyze expert routing behavior, revealing which MoE layers are capable of specializing in handling specific information and which are not. This provides guidance for retaining the critical layers while removing redundancies, thereby advancing ViMoE to be more efficient without sacrificing accuracy. We aspire for this work to offer new insights into the design of vision MoE models and provide valuable empirical guidance for future research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15732v1-abstract-full').style.display = 'none'; document.getElementById('2410.15732v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.14189">arXiv:2410.14189</a> <span> [<a href="https://arxiv.org/pdf/2410.14189">pdf</a>, <a href="https://arxiv.org/format/2410.14189">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Neural Signed Distance Function Inference through Splatting 3D Gaussians Pulled on Zero-Level Set </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wenyuan Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yu-Shen Liu</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zhizhong Han</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.14189v1-abstract-short" style="display: inline;"> It is vital to infer a signed distance function (SDF) in multi-view based surface reconstruction. 3D Gaussian splatting (3DGS) provides a novel perspective for volume rendering, and shows advantages in rendering efficiency and quality. 
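A minimal sketch of the shared-expert MoE block described in this abstract, assuming a ViT-style token layout, top-1 routing, and illustrative layer sizes (not the authors' code):

```python
# Illustrative sketch (assumptions, not the ViMoE implementation): a feed-forward MoE
# block with a top-1 router over routed experts plus one always-active shared expert
# that captures common information.
import torch
import torch.nn as nn
import torch.nn.functional as F

class SharedExpertMoE(nn.Module):
    def __init__(self, dim=384, hidden=1536, num_experts=4):
        super().__init__()
        self.router = nn.Linear(dim, num_experts)            # token-wise gating
        self.experts = nn.ModuleList(
            [nn.Sequential(nn.Linear(dim, hidden), nn.GELU(), nn.Linear(hidden, dim))
             for _ in range(num_experts)]
        )
        self.shared_expert = nn.Sequential(                  # always-on shared expert
            nn.Linear(dim, hidden), nn.GELU(), nn.Linear(hidden, dim)
        )

    def forward(self, x):                                    # x: (batch, tokens, dim)
        gate = F.softmax(self.router(x), dim=-1)             # routing probabilities
        top1 = gate.argmax(dim=-1)                           # top-1 expert per token
        routed = torch.zeros_like(x)
        for i, expert in enumerate(self.experts):
            mask = (top1 == i).unsqueeze(-1).to(x.dtype)     # tokens assigned to expert i
            routed = routed + mask * expert(x)
        weight = gate.max(dim=-1, keepdim=True).values       # winning gate value
        return self.shared_expert(x) + weight * routed       # shared + routed outputs
```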
arXiv:2410.14189 (https://arxiv.org/abs/2410.14189) [pdf, other]
Subjects: cs.CV
Title: Neural Signed Distance Function Inference through Splatting 3D Gaussians Pulled on Zero-Level Set
Authors: Wenyuan Zhang, Yu-Shen Liu, Zhizhong Han
Abstract: It is vital to infer a signed distance function (SDF) in multi-view based surface reconstruction. 3D Gaussian splatting (3DGS) provides a novel perspective for volume rendering and shows advantages in rendering efficiency and quality. Although 3DGS provides a promising neural rendering option, it is still hard to infer SDFs for surface reconstruction with 3DGS due to the discreteness, the sparseness, and the off-surface drift of 3D Gaussians. To resolve these issues, we propose a method that seamlessly merges 3DGS with the learning of neural SDFs. Our key idea is to more effectively constrain the SDF inference with multi-view consistency. To this end, we dynamically align 3D Gaussians on the zero-level set of the neural SDF using neural pulling, and then render the aligned 3D Gaussians through differentiable rasterization. Meanwhile, we update the neural SDF by pulling neighboring space to the pulled 3D Gaussians, which progressively refines the signed distance field near the surface. With both differentiable pulling and splatting, we jointly optimize the 3D Gaussians and the neural SDF with both RGB and geometry constraints, which recovers more accurate, smooth, and complete surfaces with more geometric details. Our numerical and visual comparisons show our superiority over the state-of-the-art results on the widely used benchmarks.
Submitted 18 October, 2024; originally announced October 2024.
Comments: Accepted by NeurIPS 2024. Project page: https://wen-yuan-zhang.github.io/GS-Pull/
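The "pulling" operation referenced above moves a query point onto the SDF's zero-level set along the gradient direction; a minimal sketch follows (the tiny SDF network and point shapes are placeholder assumptions, not the authors' model):

```python
# Minimal sketch of the pulling operation: q' = q - f(q) * grad f(q) / ||grad f(q)||,
# which projects a query point onto the zero-level set of the SDF. The SDF network
# below is a placeholder assumption, not the paper's architecture.
import torch
import torch.nn as nn

sdf = nn.Sequential(nn.Linear(3, 64), nn.Softplus(beta=100), nn.Linear(64, 1))

def pull_to_zero_level_set(points):
    """points: (N, 3) query locations; returns their projections onto f = 0."""
    points = points.clone().requires_grad_(True)
    f = sdf(points)                                           # signed distances, (N, 1)
    grad = torch.autograd.grad(f.sum(), points, create_graph=True)[0]
    direction = grad / (grad.norm(dim=-1, keepdim=True) + 1e-8)
    return points - f * direction                             # pulled points near the surface

pulled = pull_to_zero_level_set(torch.randn(1024, 3))
```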
arXiv:2410.13602 (https://arxiv.org/abs/2410.13602) [pdf, other]
Subjects: cs.NI; cs.LG
Title: Towards Satellite Non-IID Imagery: A Spectral Clustering-Assisted Federated Learning Approach
Authors: Luyao Zou, Yu Min Park, Chu Myaet Thwal, Yan Kyaw Tun, Zhu Han, Choong Seon Hong
Abstract: Low Earth orbit (LEO) satellites are capable of gathering abundant Earth observation data (EOD) to enable different Internet of Things (IoT) applications. However, to accomplish an effective EOD processing mechanism, it is imperative to investigate: 1) the challenge of processing the observed data without transmitting those large-size data to the ground, because the connection between the satellites and the ground stations is intermittent, and 2) the challenge of processing the non-independent and identically distributed (non-IID) satellite data. In this paper, to cope with those challenges, we propose an orbit-based spectral clustering-assisted clustered federated self-knowledge distillation (OSC-FSKD) approach for each orbit of an LEO satellite constellation, which retains the advantage of FL that the observed data does not need to be sent to the ground. Specifically, we introduce normalized Laplacian-based spectral clustering (NLSC) into federated learning (FL) to create clustered FL in each round to address the challenge resulting from non-IID data. In particular, NLSC is adopted to dynamically group clients into several clusters based on cosine similarities calculated from model updates. In addition, self-knowledge distillation is utilized to construct each local client, where the most recently updated local model is used to guide current local model training. Experiments demonstrate that the observation accuracy obtained by the proposed method is 1.01x, 2.15x, 1.10x, and 1.03x higher than that of the pFedSD, FedProx, FedAU, and FedALA approaches, respectively, using the SAT4 dataset. The proposed method also shows superiority when using other datasets.
Submitted 18 October, 2024; v1 submitted 17 October, 2024; originally announced October 2024.
Comments: 10 pages, 5 figures
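A minimal sketch of the clustering step described above, grouping clients by normalized-Laplacian spectral clustering over cosine similarities of their model updates (NumPy-only; the plain k-means step and cluster count are assumptions, not the paper's exact procedure):

```python
# Illustrative sketch (assumptions, not the OSC-FSKD implementation): cluster FL clients
# by normalized-Laplacian spectral clustering over cosine similarities of model updates.
import numpy as np

def spectral_cluster_clients(updates, k):
    """updates: (n_clients, n_params) flattened model deltas; returns cluster labels."""
    u = updates / (np.linalg.norm(updates, axis=1, keepdims=True) + 1e-12)
    s = np.clip(u @ u.T, 0.0, None)                  # cosine similarity as affinity
    d = np.diag(1.0 / np.sqrt(s.sum(axis=1) + 1e-12))
    laplacian = np.eye(len(s)) - d @ s @ d           # L = I - D^{-1/2} S D^{-1/2}
    _, eigvecs = np.linalg.eigh(laplacian)
    embedding = eigvecs[:, :k]                       # k smallest eigenvectors
    embedding /= np.linalg.norm(embedding, axis=1, keepdims=True) + 1e-12

    # plain k-means on the spectral embedding
    rng = np.random.default_rng(0)
    centers = embedding[rng.choice(len(embedding), k, replace=False)]
    for _ in range(50):
        labels = np.argmin(((embedding[:, None] - centers) ** 2).sum(-1), axis=1)
        centers = np.stack([embedding[labels == j].mean(0) if np.any(labels == j) else centers[j]
                            for j in range(k)])
    return labels
```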
arXiv:2410.12246 (https://arxiv.org/abs/2410.12246) [pdf, other]
Subjects: cs.IT
Title: Transmission Scheduling of Millimeter Wave Communication for High-Speed Railway in Space-Air-Ground Integrated Network
Authors: Lei Liu, Bo Ai, Yong Niu, Zhu Han, Ning Wang, Lei Xiong, Ruisi He
Abstract: The space-air-ground integrated network (SAGIN) greatly improves coverage and reliability for millimeter-wave (mmWave) communication in high-speed railway (HSR) scenarios. However, a significant challenge arises in transmission scheduling due to the rapid changes in channel state, the link selection for train mobile relays (MRs), and the order of flow scheduling. To tackle this challenge, we introduce an optimization problem focused on maximizing the weighted sum of completed flows that satisfy the quality of service (QoS) requirements for HSR mmWave communication in SAGIN. To facilitate the simultaneous scheduling of flows by base station-MR (BS-MR), satellite-airship-MR, and satellite-MR links, we propose a link selection algorithm, which helps each flow choose a suitable set of links in every frame and determines whether the BS networks need the assistance of the satellite and airship. Furthermore, taking into account the priority and occupied time-slot (TS) resources of different flows, we propose a multi-link weighted flow scheduling (MWFS) algorithm. This algorithm not only prioritizes scheduling high-priority flows but also aims to maximize the weighted sum of completed flows for MRs. Our simulation results confirm that the proposed algorithm significantly increases the weighted sum of completed flows and the total transmitted bits. Additionally, the proposed algorithm achieves optimal flow transmission in different link switching periods and enhances the scheduling of high-priority flows compared to other algorithms.
Submitted 16 October, 2024; originally announced October 2024.
Comments: 16 pages, 15 figures, IEEE Transactions on Vehicular Technology

arXiv:2410.04581 (https://arxiv.org/abs/2410.04581) [pdf, other]
Subjects: cs.PL
Title: Efficient Linearizability Monitoring for Sets, Stacks, Queues and Priority Queues
Authors: Lee Zheng Han, Umang Mathur
Abstract: In this paper, we consider the problem of automatically monitoring linearizability. Here, one observes an execution of a concurrent program that interacts with a concurrent object and determines whether the execution witnesses a violation of linearizability with respect to the sequential specification of the underlying data structure of the concurrent object. This problem has been extensively studied in the past for read-write registers, and both tight upper and lower bounds have been proposed in this case. While this problem has also been studied for other prominent data structures such as stacks and queues, we find that these results are either not extensive or, in some cases, incorrect. In this paper, we study the problem under the restriction that the values inserted in the data types are distinct (in the execution observed). We then show that under such a restriction, the linearizability problem is solvable in polynomial time for these data types. Beyond theoretical soundness and completeness, the proposed algorithms are empirically shown to outperform all state-of-the-art linearizability monitors.
Submitted 6 October, 2024; originally announced October 2024.

arXiv:2410.02796 (https://arxiv.org/abs/2410.02796) [pdf, other]
Subjects: eess.SP; cs.ET; cs.IT; cs.NI
Title: Toward Adaptive Tracking and Communication via an Airborne Maneuverable Bi-Static ISAC System
Authors: Mingliang Wei, Ruoguang Li, Li Wang, Lianming Xu, Zhu Han
Abstract: In this letter, we propose an airborne maneuverable bi-static integrated sensing and communication (ISAC) system in which both the transmitter and the receiver are unmanned aerial vehicles (UAVs). By timely forming a dynamic bi-static range based on the motion information of the target, such a system can provide adaptive two-dimensional tracking and communication services. Towards this end, a trajectory optimization problem for both the transmit and receive UAVs is formulated to achieve high-accuracy motion state estimation by minimizing the time-variant Cramér-Rao bound, subject to a sufficient communication signal-to-noise ratio to maintain the communication channel prediction error. We then develop an efficient approach based on the successive convex approximation technique and the S-procedure to address the problem. Numerical results demonstrate that our proposed airborne maneuverable bi-static ISAC system is able to obtain higher tracking accuracy compared with static or semi-dynamic ISAC systems.
Submitted 18 September, 2024; originally announced October 2024.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.02796v1-abstract-full').style.display = 'none'; document.getElementById('2410.02796v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.02122">arXiv:2410.02122</a> <span> [<a href="https://arxiv.org/pdf/2410.02122">pdf</a>, <a href="https://arxiv.org/ps/2410.02122">ps</a>, <a href="https://arxiv.org/format/2410.02122">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Resource Allocation Based on Optimal Transport Theory in ISAC-Enabled Multi-UAV Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zheng%2C+Y">Yufeng Zheng</a>, <a href="/search/cs?searchtype=author&query=Li%2C+L">Lixin Li</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+W">Wensheng Lin</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+W">Wei Liang</a>, <a href="/search/cs?searchtype=author&query=Du%2C+Q">Qinghe Du</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zhu Han</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.02122v1-abstract-short" style="display: inline;"> This paper investigates the resource allocation optimization for cooperative communication with non-cooperative localization in integrated sensing and communications (ISAC)-enabled multi-unmanned aerial vehicle (UAV) cooperative networks. Our goal is to maximize the weighted sum of the system's average sum rate and the localization quality of service (QoS) by jointly optimizing cell association, c… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.02122v1-abstract-full').style.display = 'inline'; document.getElementById('2410.02122v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.02122v1-abstract-full" style="display: none;"> This paper investigates the resource allocation optimization for cooperative communication with non-cooperative localization in integrated sensing and communications (ISAC)-enabled multi-unmanned aerial vehicle (UAV) cooperative networks. Our goal is to maximize the weighted sum of the system's average sum rate and the localization quality of service (QoS) by jointly optimizing cell association, communication power allocation, and sensing power allocation. Since the formulated problem is a mixed-integer nonconvex problem, we propose the alternating iteration algorithm based on optimal transport theory (AIBOT) to solve the optimization problem more effectively. Simulation results demonstrate that the AIBOT can improve the system sum rate by nearly 12% and reduce the localization Cr'amer-Rao bound (CRB) by almost 29% compared to benchmark algorithms. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.02122v1-abstract-full').style.display = 'none'; document.getElementById('2410.02122v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.02121">arXiv:2410.02121</a> <span> [<a href="https://arxiv.org/pdf/2410.02121">pdf</a>, <a href="https://arxiv.org/format/2410.02121">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> </div> </div> <p class="title is-5 mathjax"> SC-CDM: Enhancing Quality of Image Semantic Communication with a Compact Diffusion Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kexin Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+L">Lixin Li</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+W">Wensheng Lin</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+Y">Yuna Yan</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+W">Wenchi Cheng</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zhu Han</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.02121v1-abstract-short" style="display: inline;"> Semantic Communication (SC) is an emerging technology that has attracted much attention in the sixth-generation (6G) mobile communication systems. However, few literature has fully considered the perceptual quality of the reconstructed image. To solve this problem, we propose a generative SC for wireless image transmission (denoted as SC-CDM). This approach leverages compact diffusion models to im… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.02121v1-abstract-full').style.display = 'inline'; document.getElementById('2410.02121v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.02121v1-abstract-full" style="display: none;"> Semantic Communication (SC) is an emerging technology that has attracted much attention in the sixth-generation (6G) mobile communication systems. However, few literature has fully considered the perceptual quality of the reconstructed image. To solve this problem, we propose a generative SC for wireless image transmission (denoted as SC-CDM). This approach leverages compact diffusion models to improve the fidelity and semantic accuracy of the images reconstructed after transmission, ensuring that the essential content is preserved even in bandwidth-constrained environments. Specifically, we aim to redesign the swin Transformer as a new backbone for efficient semantic feature extraction and compression. Next, the receiver integrates the slim prior and image reconstruction networks. 
arXiv:2410.02121 (https://arxiv.org/abs/2410.02121) [pdf, other]
Subjects: eess.IV; cs.LG; cs.NI
Title: SC-CDM: Enhancing Quality of Image Semantic Communication with a Compact Diffusion Model
Authors: Kexin Zhang, Lixin Li, Wensheng Lin, Yuna Yan, Wenchi Cheng, Zhu Han
Abstract: Semantic Communication (SC) is an emerging technology that has attracted much attention in sixth-generation (6G) mobile communication systems. However, little of the existing literature has fully considered the perceptual quality of the reconstructed image. To solve this problem, we propose a generative SC framework for wireless image transmission (denoted as SC-CDM). This approach leverages compact diffusion models to improve the fidelity and semantic accuracy of the images reconstructed after transmission, ensuring that the essential content is preserved even in bandwidth-constrained environments. Specifically, we redesign the Swin Transformer as a new backbone for efficient semantic feature extraction and compression. At the receiver, we integrate slim prior and image reconstruction networks. Compared to traditional Diffusion Models (DMs), this design leverages the DMs' robust distribution-mapping capability to generate a compact condition vector that guides image recovery, thus enhancing the perceptual details of the reconstructed images. Finally, a series of evaluation and ablation studies are conducted to validate the effectiveness and robustness of the proposed algorithm, which further increases the Peak Signal-to-Noise Ratio (PSNR) by over 17% on top of CNN-based DeepJSCC.
Submitted 2 October, 2024; originally announced October 2024.
Comments: arXiv admin note: text overlap with arXiv:2408.05112

arXiv:2410.02120 (https://arxiv.org/abs/2410.02120) [pdf, ps, other]
Subjects: cs.NI; cs.LG; eess.SY
Title: Lossy Cooperative UAV Relaying Networks: Outage Probability Analysis and Location Optimization
Authors: Ya Lian, Wensheng Lin, Lixin Li, Fucheng Yang, Zhu Han, Tad Matsumoto
Abstract: In this paper, the performance of a lossy cooperative unmanned aerial vehicle (UAV) relay communication system is analyzed. In this system, the UAV relay adopts the lossy-forward (LF) strategy and the receiver has certain distortion requirements for the received information. For the system described above, we first derive the achievable rate-distortion region of the system. Then, on the basis of the region analysis, we analyze the system outage probability when the channel suffers Nakagami-$m$ fading. Finally, we design an optimal relay position identification algorithm based on the Soft Actor-Critic (SAC) algorithm, which determines the optimal UAV position to minimize the outage probability. The simulation results show that the proposed algorithm can optimize the UAV position and reduce the system outage probability effectively.
Submitted 2 October, 2024; originally announced October 2024.
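Outage probabilities under Nakagami-$m$ fading, as analyzed above, are commonly checked by Monte Carlo simulation using the fact that the squared envelope is Gamma-distributed; the sketch below uses illustrative parameter values and a single link, not the paper's relay system model:

```python
# Hedged sketch: Monte Carlo estimate of outage probability for one link under
# Nakagami-m fading. The squared envelope |h|^2 follows Gamma(m, omega/m); outage is
# declared when instantaneous capacity falls below a target rate R. All parameters
# are illustrative assumptions.
import numpy as np

def outage_probability(m=2.0, omega=1.0, snr_db=10.0, rate=1.0, trials=1_000_000, seed=0):
    rng = np.random.default_rng(seed)
    gain = rng.gamma(shape=m, scale=omega / m, size=trials)   # |h|^2 under Nakagami-m
    snr = 10 ** (snr_db / 10) * gain
    capacity = np.log2(1.0 + snr)                             # bits/s/Hz
    return np.mean(capacity < rate)

print(outage_probability())
```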
arXiv:2410.01603 (https://arxiv.org/abs/2410.01603) [pdf, other]
Subjects: cs.NI
Title: Beamforming in Secure Integrated Sensing and Communication Systems with Antenna Allocation
Authors: Yunxiang Shi, Lixin Li, Wensheng Lin, Wei Liang, Zhu Han
Abstract: In this paper, we consider joint antenna allocation and transmit beamforming design in secure integrated sensing and communication (ISAC) systems. A dual-function base station (DFBS) aims to securely deliver messages to a single-antenna receiver while detecting potential eavesdroppers. To prevent eavesdropping, we incorporate specialized sensing signals, intentionally reducing communication signal power toward suspicious targets to improve sensing. We prioritize minimizing the matching error between the transmitted and required beampatterns for sensing and communication. Our design optimizes antenna allocation and beamforming at the DFBS while meeting minimum secrecy-rate and power constraints. We propose solvers based on alternating optimization for the non-convex design problem. Simulations show that the antenna allocation scheme significantly improves safety performance.
Submitted 2 October, 2024; originally announced October 2024.

arXiv:2410.01597 (https://arxiv.org/abs/2410.01597) [pdf, other]
Subjects: cs.NI; cs.LG; eess.SP
Title: SAFE: Semantic Adaptive Feature Extraction with Rate Control for 6G Wireless Communications
Authors: Yuna Yan, Lixin Li, Xin Zhang, Wensheng Lin, Wenchi Cheng, Zhu Han
Abstract: Most current Deep Learning-based Semantic Communication (DeepSC) systems are designed and trained exclusively for particular single-channel conditions, which restricts their adaptability and overall bandwidth utilization. To address this, we propose an innovative Semantic Adaptive Feature Extraction (SAFE) framework, which significantly improves bandwidth efficiency by allowing users to select different sub-semantic combinations based on their channel conditions. This paper also introduces three advanced learning algorithms to optimize the performance of the SAFE framework as a whole. Through a series of simulation experiments, we demonstrate that the SAFE framework can effectively and adaptively extract and transmit semantics under different channel bandwidth conditions, whose effectiveness is verified through objective and subjective quality evaluations.
Submitted 2 October, 2024; originally announced October 2024.

arXiv:2410.01564 (https://arxiv.org/abs/2410.01564) [pdf, ps, other]
Subjects: cs.IT; cs.NI
Title: Outage Probability Analysis for OTFS in Lossy Communications
Authors: Xin Zhang, Wensheng Lin, Lixin Li, Fucheng Yang, Zhu Han, Tad Matsumoto
Abstract: This paper analyzes the outage probability of orthogonal time frequency space (OTFS) modulation in a lossy communication scenario. First of all, we introduce the channel model and the vector-form representation of OTFS used in this paper. Then, we derive an exact expression for the OTFS outage probability in lossy communication scenarios using Shannon's lossy source-channel separation theorem. Because the channel is time-varying, calculating the exact outage probability is computationally expensive. Therefore, this paper aims to derive a lower bound on the outage probability, which can be calculated relatively easily. Thus, given the distortion requirement and the number of resolvable paths, we can obtain a performance limit under the optimal condition as a reference. Finally, the experimental results for the outage probability are obtained by the Monte Carlo method and compared with the theoretical results calculated from the closed-form expression of the lower bound.
Submitted 2 October, 2024; originally announced October 2024.
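The outage criterion behind this analysis follows Shannon's lossy source-channel separation theorem: an outage occurs when the instantaneous capacity falls below the source's rate-distortion function $R(D)$. The sketch below illustrates that condition with a Gaussian source, $R(D)=\frac{1}{2}\log_2(\sigma^2/D)$, and a simple multipath fading stand-in for the OTFS channel (assumptions for illustration only, not the paper's channel model):

```python
# Hedged sketch of the separation-theorem outage condition: outage when C < R(D).
# A Gaussian source and i.i.d. complex Gaussian path gains stand in for the OTFS
# model purely for illustration; parameters are assumptions.
import numpy as np

def outage_lossy(distortion=0.2, sigma2=1.0, snr_db=10.0, paths=4, trials=200_000, seed=1):
    rng = np.random.default_rng(seed)
    r_d = 0.5 * np.log2(sigma2 / distortion)                  # required rate R(D)
    # energy split evenly across the resolvable paths
    h = (rng.normal(size=(trials, paths)) + 1j * rng.normal(size=(trials, paths))) / np.sqrt(2 * paths)
    snr = 10 ** (snr_db / 10) * np.sum(np.abs(h) ** 2, axis=1)
    capacity = np.log2(1.0 + snr)
    return np.mean(capacity < r_d)

print(outage_lossy())
```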
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.00086">arXiv:2410.00086</a> <span> [<a href="https://arxiv.org/pdf/2410.00086">pdf</a>, <a href="https://arxiv.org/format/2410.00086">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> ACE: All-round Creator and Editor Following Instructions via Diffusion Transformer </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zhen Han</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+Z">Zeyinzi Jiang</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+Y">Yulin Pan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jingfeng Zhang</a>, <a href="/search/cs?searchtype=author&query=Mao%2C+C">Chaojie Mao</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+C">Chenwei Xie</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yu Liu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Jingren Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.00086v2-abstract-short" style="display: inline;"> Diffusion models have emerged as a powerful generative technology and have been found to be applicable in various scenarios. Most existing foundational diffusion models are primarily designed for text-guided visual generation and do not support multi-modal conditions, which are essential for many visual editing tasks. This limitation prevents these foundational diffusion models from serving as a u… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.00086v2-abstract-full').style.display = 'inline'; document.getElementById('2410.00086v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.00086v2-abstract-full" style="display: none;"> Diffusion models have emerged as a powerful generative technology and have been found to be applicable in various scenarios. Most existing foundational diffusion models are primarily designed for text-guided visual generation and do not support multi-modal conditions, which are essential for many visual editing tasks. This limitation prevents these foundational diffusion models from serving as a unified model in the field of visual generation, like GPT-4 in the natural language processing field. In this work, we propose ACE, an All-round Creator and Editor, which achieves comparable performance compared to those expert models in a wide range of visual generation tasks. To achieve this goal, we first introduce a unified condition format termed Long-context Condition Unit (LCU), and propose a novel Transformer-based diffusion model that uses LCU as input, aiming for joint training across various generation and editing tasks. Furthermore, we propose an efficient data collection approach to address the issue of the absence of available training data. 
It involves acquiring pairwise images with synthesis-based or clustering-based pipelines and supplying these pairs with accurate textual instructions by leveraging a fine-tuned multi-modal large language model. To comprehensively evaluate the performance of our model, we establish a benchmark of manually annotated pairs data across a variety of visual generation tasks. The extensive experimental results demonstrate the superiority of our model in visual generation fields. Thanks to the all-in-one capabilities of our model, we can easily build a multi-modal chat system that responds to any interactive request for image creation using a single model to serve as the backend, avoiding the cumbersome pipeline typically employed in visual agents. Code and models will be available on the project page: https://ali-vilab.github.io/ace-page/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.00086v2-abstract-full').style.display = 'none'; document.getElementById('2410.00086v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 30 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.19375">arXiv:2409.19375</a> <span> [<a href="https://arxiv.org/pdf/2409.19375">pdf</a>, <a href="https://arxiv.org/format/2409.19375">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> DOTA: Distributional Test-Time Adaptation of Vision-Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zongbo Han</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jialong Yang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Junfan Li</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Q">Qinghua Hu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Q">Qianli Xu</a>, <a href="/search/cs?searchtype=author&query=Shou%2C+M+Z">Mike Zheng Shou</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Changqing Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.19375v1-abstract-short" style="display: inline;"> Vision-language foundation models (e.g., CLIP) have shown remarkable performance across a wide range of tasks. However, deploying these models may be unreliable when significant distribution gaps exist between the training and test data. 
The training-free test-time dynamic adapter (TDA) is a promising approach to address this issue by storing representative test samples to guide the classification… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.19375v1-abstract-full').style.display = 'inline'; document.getElementById('2409.19375v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.19375v1-abstract-full" style="display: none;"> Vision-language foundation models (e.g., CLIP) have shown remarkable performance across a wide range of tasks. However, deploying these models may be unreliable when significant distribution gaps exist between the training and test data. The training-free test-time dynamic adapter (TDA) is a promising approach to address this issue by storing representative test samples to guide the classification of subsequent ones. However, TDA only naively maintains a limited number of reference samples in the cache, leading to severe test-time catastrophic forgetting when the cache is updated by dropping samples. In this paper, we propose a simple yet effective method for DistributiOnal Test-time Adaptation (Dota). Instead of naively memorizing representative test samples, Dota continually estimates the distributions of test samples, allowing the model to continually adapt to the deployment environment. The test-time posterior probabilities are then computed using the estimated distributions based on Bayes' theorem for adaptation purposes. To further enhance the adaptability on the uncertain samples, we introduce a new human-in-the-loop paradigm which identifies uncertain samples, collects human-feedback, and incorporates it into the Dota framework. Extensive experiments validate that Dota enables CLIP to continually learn, resulting in a significant improvement compared to current state-of-the-art methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.19375v1-abstract-full').style.display = 'none'; document.getElementById('2409.19375v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">In submission</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.19339">arXiv:2409.19339</a> <span> [<a href="https://arxiv.org/pdf/2409.19339">pdf</a>, <a href="https://arxiv.org/format/2409.19339">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Visual Question Decomposition on Multimodal Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Haowei Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jianzhe Liu</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zhen Han</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+S">Shuo Chen</a>, <a href="/search/cs?searchtype=author&query=He%2C+B">Bailan He</a>, <a href="/search/cs?searchtype=author&query=Tresp%2C+V">Volker Tresp</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Zhiqiang Xu</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+J">Jindong Gu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.19339v2-abstract-short" style="display: inline;"> Question decomposition has emerged as an effective strategy for prompting Large Language Models (LLMs) to answer complex questions. However, while existing methods primarily focus on unimodal language models, the question decomposition capability of Multimodal Large Language Models (MLLMs) has yet to be explored. To this end, this paper explores visual question decomposition on MLLMs. Specifically… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.19339v2-abstract-full').style.display = 'inline'; document.getElementById('2409.19339v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.19339v2-abstract-full" style="display: none;"> Question decomposition has emerged as an effective strategy for prompting Large Language Models (LLMs) to answer complex questions. However, while existing methods primarily focus on unimodal language models, the question decomposition capability of Multimodal Large Language Models (MLLMs) has yet to be explored. To this end, this paper explores visual question decomposition on MLLMs. Specifically, we introduce a systematic evaluation framework including a dataset and several evaluation criteria to assess the quality of the decomposed sub-questions, revealing that existing MLLMs struggle to produce high-quality sub-questions. To address this limitation, we propose a specific finetuning dataset, DecoVQA+, for enhancing the model's question decomposition capability. 
Aiming at enabling models to perform appropriate selective decomposition, we propose an efficient finetuning pipeline. The finetuning pipeline consists of our proposed dataset and a training objective for selective decomposition. Finetuned MLLMs demonstrate significant improvements in the quality of sub-questions and the policy of selective question decomposition. Additionally, the models also achieve higher accuracy with selective decomposition on VQA benchmark datasets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.19339v2-abstract-full').style.display = 'none'; document.getElementById('2409.19339v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to EMNLP2024 Findings</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.18718">arXiv:2409.18718</a> <span> [<a href="https://arxiv.org/pdf/2409.18718">pdf</a>, <a href="https://arxiv.org/format/2409.18718">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Enhancing Spectrum Efficiency in 6G Satellite Networks: A GAIL-Powered Policy Learning via Asynchronous Federated Inverse Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hassan%2C+S+S">Sheikh Salman Hassan</a>, <a href="/search/cs?searchtype=author&query=Park%2C+Y+M">Yu Min Park</a>, <a href="/search/cs?searchtype=author&query=Tun%2C+Y+K">Yan Kyaw Tun</a>, <a href="/search/cs?searchtype=author&query=Saad%2C+W">Walid Saad</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zhu Han</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+C+S">Choong Seon Hong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.18718v1-abstract-short" style="display: inline;"> In this paper, a novel generative adversarial imitation learning (GAIL)-powered policy learning approach is proposed for optimizing beamforming, spectrum allocation, and remote user equipment (RUE) association in NTNs. Traditional reinforcement learning (RL) methods for wireless network optimization often rely on manually designed reward functions, which can require extensive parameter tuning. 
To… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.18718v1-abstract-full').style.display = 'inline'; document.getElementById('2409.18718v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.18718v1-abstract-full" style="display: none;"> In this paper, a novel generative adversarial imitation learning (GAIL)-powered policy learning approach is proposed for optimizing beamforming, spectrum allocation, and remote user equipment (RUE) association in NTNs. Traditional reinforcement learning (RL) methods for wireless network optimization often rely on manually designed reward functions, which can require extensive parameter tuning. To overcome these limitations, we employ inverse RL (IRL), specifically leveraging the GAIL framework, to automatically learn reward functions without manual design. We augment this framework with an asynchronous federated learning approach, enabling decentralized multi-satellite systems to collaboratively derive optimal policies. The proposed method aims to maximize spectrum efficiency (SE) while meeting minimum information rate requirements for RUEs. To address the non-convex, NP-hard nature of this problem, we combine the many-to-one matching theory with a multi-agent asynchronous federated IRL (MA-AFIRL) framework. This allows agents to learn through asynchronous environmental interactions, improving training efficiency and scalability. The expert policy is generated using the Whale optimization algorithm (WOA), providing data to train the automatic reward function within GAIL. Simulation results show that the proposed MA-AFIRL method outperforms traditional RL approaches, achieving a $14.6\%$ improvement in convergence and reward value. The novel GAIL-driven policy learning establishes a novel benchmark for 6G NTN optimization. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.18718v1-abstract-full').style.display = 'none'; document.getElementById('2409.18718v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
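<p class="is-size-7">As a rough sketch of the GAIL ingredient summarized above: a discriminator is trained to separate expert state-action pairs from policy rollouts, and its output is then used as a learned reward in place of a hand-designed one. The class name, network sizes, and the -log(1-D) reward surrogate below are illustrative assumptions, not the paper's MA-AFIRL implementation.</p> <pre><code class="language-python">
import torch
import torch.nn as nn

class Discriminator(nn.Module):
    """Scores state-action pairs; trained to tell expert data from policy rollouts."""
    def __init__(self, state_dim, action_dim, hidden=64):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(state_dim + action_dim, hidden), nn.Tanh(),
            nn.Linear(hidden, 1),
        )

    def forward(self, state, action):
        return self.net(torch.cat([state, action], dim=-1))  # raw logits

def discriminator_loss(disc, expert_s, expert_a, policy_s, policy_a):
    # Expert pairs are labelled 1, policy pairs 0.
    bce = nn.BCEWithLogitsLoss()
    e = disc(expert_s, expert_a)
    p = disc(policy_s, policy_a)
    return bce(e, torch.ones_like(e)) + bce(p, torch.zeros_like(p))

def learned_reward(disc, state, action, eps=1e-8):
    # Common GAIL surrogate reward: -log(1 - D(s, a)); the RL agent
    # maximizes this instead of a manually designed reward function.
    with torch.no_grad():
        return -torch.log(1.0 - torch.sigmoid(disc(state, action)) + eps)
</code></pre>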
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to IEEE Transactions on Mobile Computing (16 pages, 10 figures)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.16920">arXiv:2409.16920</a> <span> [<a href="https://arxiv.org/pdf/2409.16920">pdf</a>, <a href="https://arxiv.org/format/2409.16920">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Cross-lingual Speech Emotion Recognition: Humans vs. Self-Supervised Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zhichen Han</a>, <a href="/search/cs?searchtype=author&query=Geng%2C+T">Tianqi Geng</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+H">Hui Feng</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+J">Jiahong Yuan</a>, <a href="/search/cs?searchtype=author&query=Richmond%2C+K">Korin Richmond</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yuanchao Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.16920v1-abstract-short" style="display: inline;"> Utilizing Self-Supervised Learning (SSL) models for Speech Emotion Recognition (SER) has proven effective, yet limited research has explored cross-lingual scenarios. This study presents a comparative analysis between human performance and SSL models, beginning with a layer-wise analysis and an exploration of parameter-efficient fine-tuning strategies in monolingual, cross-lingual, and transfer lea… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.16920v1-abstract-full').style.display = 'inline'; document.getElementById('2409.16920v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.16920v1-abstract-full" style="display: none;"> Utilizing Self-Supervised Learning (SSL) models for Speech Emotion Recognition (SER) has proven effective, yet limited research has explored cross-lingual scenarios. This study presents a comparative analysis between human performance and SSL models, beginning with a layer-wise analysis and an exploration of parameter-efficient fine-tuning strategies in monolingual, cross-lingual, and transfer learning contexts. We further compare the SER ability of models and humans at both utterance- and segment-levels. Additionally, we investigate the impact of dialect on cross-lingual SER through human evaluation. Our findings reveal that models, with appropriate knowledge transfer, can adapt to the target language and achieve performance comparable to native speakers. 
We also demonstrate the significant effect of dialect on SER for individuals without prior linguistic and paralinguistic background. Moreover, both humans and models exhibit distinct behaviors across different emotions. These results offer new insights into the cross-lingual SER capabilities of SSL models, underscoring both their similarities to and differences from human emotion perception. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.16920v1-abstract-full').style.display = 'none'; document.getElementById('2409.16920v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.16676">arXiv:2409.16676</a> <span> [<a href="https://arxiv.org/pdf/2409.16676">pdf</a>, <a href="https://arxiv.org/format/2409.16676">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computational Engineering, Finance, and Science">cs.CE</span> </div> </div> <p class="title is-5 mathjax"> An Integrated Machine Learning and Deep Learning Framework for Credit Card Approval Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tong%2C+K">Kejian Tong</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zonglin Han</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+Y">Yanxin Shen</a>, <a href="/search/cs?searchtype=author&query=Long%2C+Y">Yujian Long</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+Y">Yijing Wei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.16676v1-abstract-short" style="display: inline;"> Credit scoring is vital in the financial industry, assessing the risk of lending to credit card applicants. Traditional credit scoring methods face challenges with large datasets and data imbalance between creditworthy and non-creditworthy applicants. This paper introduces an advanced machine learning and deep learning framework to improve the accuracy and reliability of credit card approval predi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.16676v1-abstract-full').style.display = 'inline'; document.getElementById('2409.16676v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.16676v1-abstract-full" style="display: none;"> Credit scoring is vital in the financial industry, assessing the risk of lending to credit card applicants. Traditional credit scoring methods face challenges with large datasets and data imbalance between creditworthy and non-creditworthy applicants. This paper introduces an advanced machine learning and deep learning framework to improve the accuracy and reliability of credit card approval predictions. We utilized extensive datasets of user application records and credit history, implementing a comprehensive preprocessing strategy, feature engineering, and model integration. 
Our methodology combines neural networks with an ensemble of base models, including logistic regression, support vector machines, k-nearest neighbors, decision trees, random forests, and gradient boosting. The ensemble approach addresses data imbalance using Synthetic Minority Over-sampling Technique (SMOTE) and mitigates overfitting risks. Experimental results show that our integrated model surpasses traditional single-model approaches in precision, recall, F1-score, AUC, and Kappa, providing a robust and scalable solution for credit card approval predictions. This research underscores the potential of advanced machine learning techniques to transform credit risk assessment and financial decision-making. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.16676v1-abstract-full').style.display = 'none'; document.getElementById('2409.16676v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.14726">arXiv:2409.14726</a> <span> [<a href="https://arxiv.org/pdf/2409.14726">pdf</a>, <a href="https://arxiv.org/format/2409.14726">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Emerging Technologies">cs.ET</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Semantic Communication Enabled 6G-NTN Framework: A Novel Denoising and Gateway Hop Integration Mechanism </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Nguyen%2C+L+X">Loc X. Nguyen</a>, <a href="/search/cs?searchtype=author&query=Hassan%2C+S+S">Sheikh Salman Hassan</a>, <a href="/search/cs?searchtype=author&query=Tun%2C+Y+K">Yan Kyaw Tun</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+K">Kitae Kim</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zhu Han</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+C+S">Choong Seon Hong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.14726v1-abstract-short" style="display: inline;"> The sixth-generation (6G) non-terrestrial networks (NTNs) are crucial for real-time monitoring in critical applications like disaster relief. However, limited bandwidth, latency, rain attenuation, long propagation delays, and co-channel interference pose challenges to efficient satellite communication. Therefore, semantic communication (SC) has emerged as a promising solution to improve transmissi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.14726v1-abstract-full').style.display = 'inline'; document.getElementById('2409.14726v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.14726v1-abstract-full" style="display: none;"> The sixth-generation (6G) non-terrestrial networks (NTNs) are crucial for real-time monitoring in critical applications like disaster relief. 
However, limited bandwidth, latency, rain attenuation, long propagation delays, and co-channel interference pose challenges to efficient satellite communication. Therefore, semantic communication (SC) has emerged as a promising solution to improve transmission efficiency and address these issues. In this paper, we explore the potential of SC as a bandwidth-efficient, latency-minimizing strategy specifically suited to 6G satellite communications. While existing SC methods have demonstrated efficacy in direct satellite-terrestrial transmissions, they encounter limitations in satellite networks due to distortion accumulation across gateway hop-relays. Additionally, certain ground users (GUs) experience poor signal-to-noise ratios (SNR), making direct satellite communication challenging. To address these issues, we propose a novel framework that optimizes gateway hop-relay selection for GUs with low SNR and integrates gateway-based denoising mechanisms to ensure high-quality-of-service (QoS) in satellite-based SC networks. This approach directly mitigates distortion, leading to significant improvements in satellite service performance by delivering customized services tailored to the unique signal conditions of each GU. Our findings represent a critical advancement in reliable and efficient data transmission from the Earth observation satellites, thereby enabling fast and effective responses to urgent events. Simulation results demonstrate that our proposed strategy significantly enhances overall network performance, outperforming conventional methods by offering tailored communication services based on specific GU conditions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.14726v1-abstract-full').style.display = 'none'; document.getElementById('2409.14726v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages, 8 figures, 2 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.14688">arXiv:2409.14688</a> <span> [<a href="https://arxiv.org/pdf/2409.14688">pdf</a>, <a href="https://arxiv.org/format/2409.14688">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> A Generalized Control Revision Method for Autonomous Driving Safety </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhu%2C+Z">Zehang Zhu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yuning Wang</a>, <a href="/search/cs?searchtype=author&query=Ke%2C+T">Tianqi Ke</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zeyu Han</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+S">Shaobing Xu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Q">Qing Xu</a>, <a href="/search/cs?searchtype=author&query=Dolan%2C+J+M">John M. 
Dolan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jianqiang Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.14688v1-abstract-short" style="display: inline;"> Safety is one of the most crucial challenges of autonomous driving vehicles, and one solution to guarantee safety is to employ an additional control revision module after the planning backbone. Control Barrier Function (CBF) has been widely used because of its strong mathematical foundation on safety. However, the incompatibility with heterogeneous perception data and incomplete consideration of t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.14688v1-abstract-full').style.display = 'inline'; document.getElementById('2409.14688v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.14688v1-abstract-full" style="display: none;"> Safety is one of the most crucial challenges of autonomous driving vehicles, and one solution to guarantee safety is to employ an additional control revision module after the planning backbone. Control Barrier Function (CBF) has been widely used because of its strong mathematical foundation on safety. However, the incompatibility with heterogeneous perception data and incomplete consideration of traffic scene elements make existing systems hard to apply in dynamic and complex real-world scenarios. In this study, we introduce a generalized control revision method for autonomous driving safety, which adopts both vectorized perception and occupancy grid maps as inputs and comprehensively models multiple types of traffic scene constraints based on a newly proposed barrier function. Traffic elements are integrated into one unified framework, decoupled from specific scenario settings or rules. Experiments on the CARLA, SUMO, and OnSite simulators show that the proposed algorithm can realize safe control revision under complicated scenes, adapting to various planning backbones, road topologies, and risk types. Physical platform validation also verifies the real-world application feasibility. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.14688v1-abstract-full').style.display = 'none'; document.getElementById('2409.14688v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024.
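<p class="is-size-7">A minimal sketch of the general CBF-style control-revision idea referenced above: when the planner's nominal control violates a (linearized) safety constraint of the form a·u ≥ b, it is minimally revised by projecting onto that half-space. The single linear constraint and closed-form projection are simplifying assumptions; the paper's generalized barrier function and multi-constraint handling are not reproduced here.</p> <pre><code class="language-python">
# Generic CBF-style control revision: keep the planner's nominal control when
# it already satisfies the safety constraint a.u >= b, otherwise project it
# onto that half-space (the single-constraint special case of the usual
# CBF quadratic program).
import numpy as np

def revise_control(u_nom, a, b):
    """Minimally revise u_nom so that a @ u >= b holds."""
    u_nom = np.asarray(u_nom, dtype=float)
    a = np.asarray(a, dtype=float)
    slack = a @ u_nom - b
    if slack >= 0.0:          # nominal control is already safe
        return u_nom
    # Closed-form Euclidean projection onto the half-space {u : a.u >= b}.
    return u_nom - (slack / (a @ a)) * a

# Example: a nominal (acceleration, steering) command pushed back into the safe set.
print(revise_control([1.0, 0.2], a=[0.0, 1.0], b=0.5))  # -> [1.  0.5]
</code></pre>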
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.13517">arXiv:2409.13517</a> <span> [<a href="https://arxiv.org/pdf/2409.13517">pdf</a>, <a href="https://arxiv.org/format/2409.13517">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Quantum Physics">quant-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> </div> </div> <p class="title is-5 mathjax"> Efficient Entanglement Routing for Satellite-Aerial-Terrestrial Quantum Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yu Zhang</a>, <a href="/search/cs?searchtype=author&query=Gong%2C+Y">Yanmin Gong</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+L">Lei Fan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yu Wang</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zhu Han</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Y">Yuanxiong Guo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.13517v1-abstract-short" style="display: inline;"> In the era of 6G and beyond, space-aerial-terrestrial quantum networks (SATQNs) are shaping the future of the global-scale quantum Internet. This paper investigates the collaboration among satellite, aerial, and terrestrial quantum networks to efficiently transmit high-fidelity quantum entanglements over long distances. We begin with a comprehensive overview of existing satellite-, aerial-, and te… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.13517v1-abstract-full').style.display = 'inline'; document.getElementById('2409.13517v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.13517v1-abstract-full" style="display: none;"> In the era of 6G and beyond, space-aerial-terrestrial quantum networks (SATQNs) are shaping the future of the global-scale quantum Internet. This paper investigates the collaboration among satellite, aerial, and terrestrial quantum networks to efficiently transmit high-fidelity quantum entanglements over long distances. We begin with a comprehensive overview of existing satellite-, aerial-, and terrestrial-based quantum networks. Subsequently, we address the entanglement routing problem with the objective of maximizing quantum network throughput by jointly optimizing path selection and entanglement generation rates (PS-EGR). Given that the original problem is formulated as a mixed-integer linear programming (MILP) problem, which is inherently intractable, we propose a Benders' decomposition (BD)-based algorithm to solve the problem efficiently. Numerical results validate the effectiveness of the proposed PS-EGR scheme, offering valuable insights into various optimizable factors within the system. Finally, we discuss the current challenges and propose promising avenues for future research in SATQNs. 
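<p class="is-size-7">A toy stand-in for the path-selection component described above: enumerate candidate paths between two end nodes and keep the one whose bottleneck entanglement generation rate is largest. The small link-rate table and brute-force enumeration are purely illustrative; the paper instead solves the joint PS-EGR problem as an MILP with a Benders'-decomposition-based algorithm.</p> <pre><code class="language-python">
# Toy widest-path selection over a tiny satellite/aerial/terrestrial graph.
# Link values are hypothetical entanglement generation rates (pairs/s).
def all_simple_paths(graph, src, dst, path=None):
    path = (path or []) + [src]
    if src == dst:
        yield path
        return
    for nxt in graph.get(src, {}):
        if nxt not in path:
            yield from all_simple_paths(graph, nxt, dst, path)

def best_path(graph, src, dst):
    def bottleneck(p):
        # A path's usable rate is limited by its weakest link.
        return min(graph[a][b] for a, b in zip(p, p[1:]))
    return max(all_simple_paths(graph, src, dst), key=bottleneck)

rates = {
    "ground_A": {"hap_1": 40.0, "sat_1": 15.0},
    "hap_1": {"sat_1": 30.0, "ground_B": 25.0},
    "sat_1": {"ground_B": 20.0},
}
print(best_path(rates, "ground_A", "ground_B"))  # ['ground_A', 'hap_1', 'ground_B']
</code></pre>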
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.13517v1-abstract-full').style.display = 'none'; document.getElementById('2409.13517v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.13508">arXiv:2409.13508</a> <span> [<a href="https://arxiv.org/pdf/2409.13508">pdf</a>, <a href="https://arxiv.org/format/2409.13508">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/TMC.2024.3466857">10.1109/TMC.2024.3466857 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Quantum-Assisted Joint Virtual Network Function Deployment and Maximum Flow Routing for Space Information Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yu Zhang</a>, <a href="/search/cs?searchtype=author&query=Gong%2C+Y">Yanmin Gong</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+L">Lei Fan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yu Wang</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zhu Han</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Y">Yuanxiong Guo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.13508v1-abstract-short" style="display: inline;"> Network function virtualization (NFV)-enabled space information network (SIN) has emerged as a promising method to facilitate global coverage and seamless service. This paper proposes a novel NFV-enabled SIN to provide end-to-end communication and computation services for ground users. Based on the multi-functional time expanded graph (MF-TEG), we jointly optimize the user association, virtual net… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.13508v1-abstract-full').style.display = 'inline'; document.getElementById('2409.13508v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.13508v1-abstract-full" style="display: none;"> Network function virtualization (NFV)-enabled space information network (SIN) has emerged as a promising method to facilitate global coverage and seamless service. This paper proposes a novel NFV-enabled SIN to provide end-to-end communication and computation services for ground users. Based on the multi-functional time expanded graph (MF-TEG), we jointly optimize the user association, virtual network function (VNF) deployment, and flow routing strategy (U-VNF-R) to maximize the total processed data received by users. The original problem is a mixed-integer linear program (MILP) that is intractable for classical computers. 
Inspired by quantum computing techniques, we propose a hybrid quantum-classical Benders' decomposition (HQCBD) algorithm. Specifically, we convert the master problem of the Benders' decomposition into the quadratic unconstrained binary optimization (QUBO) model and solve it with quantum computers. To further accelerate the optimization, we also design a multi-cut strategy based on the quantum advantages in parallel computing. Numerical results demonstrate the effectiveness and efficiency of the proposed algorithm and U-VNF-R scheme. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.13508v1-abstract-full').style.display = 'none'; document.getElementById('2409.13508v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.13345">arXiv:2409.13345</a> <span> [<a href="https://arxiv.org/pdf/2409.13345">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> A Novel Adaptive Fine-Tuning Algorithm for Multimodal Models: Self-Optimizing Classification and Selection of High-Quality Datasets in Remote Sensing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ren%2C+Y">Yi Ren</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+T">Tianyi Zhang</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zhixiong Han</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Weibin Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhiyang Wang</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+W">Wenbo Ji</a>, <a href="/search/cs?searchtype=author&query=Qin%2C+C">Chenhao Qin</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+C">Chenbin Liang</a>, <a href="/search/cs?searchtype=author&query=Jiao%2C+L">Licheng Jiao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.13345v1-abstract-short" style="display: inline;"> We propose an adaptive fine-tuning algorithm for multimodal large models. The core steps of this algorithm involve two stages of truncation. First, the vast amount of data is projected into a semantic vector space, and the MiniBatchKMeans algorithm is used for automated clustering. This classification ensures that the data within each cluster exhibit high semantic similarity. Next, we process the… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.13345v1-abstract-full').style.display = 'inline'; document.getElementById('2409.13345v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.13345v1-abstract-full" style="display: none;"> We propose an adaptive fine-tuning algorithm for multimodal large models. The core steps of this algorithm involve two stages of truncation. 
First, the vast amount of data is projected into a semantic vector space, and the MiniBatchKMeans algorithm is used for automated clustering. This classification ensures that the data within each cluster exhibit high semantic similarity. Next, we process the data in each cluster, calculating the translational difference between the original and perturbed data in the multimodal large model's vector space. This difference serves as a generalization metric for the data. Based on this metric, we select the data with high generalization potential for training. We applied this algorithm to train the InternLM-XComposer2-VL-7B model on two 3090 GPUs using one-third of the GeoChat multimodal remote sensing dataset. The results demonstrate that our algorithm outperforms state-of-the-art baselines. The model trained on our optimally chosen one-third dataset, based on experimental validation, exhibited only a 1% reduction in performance across various remote sensing metrics compared to the model trained on the full dataset. This approach significantly preserved general-purpose capabilities while reducing training time by 68.2%. Furthermore, the model achieved scores of 89.86 and 77.19 on the UCMerced and AID evaluation datasets, respectively, surpassing the GeoChat dataset by 5.43 and 5.16 points. It only showed a 0.91-point average decrease on the LRBEN evaluation dataset. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.13345v1-abstract-full').style.display = 'none'; document.getElementById('2409.13345v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024.
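<p class="is-size-7">A compact sketch of the two-stage selection described above, using scikit-learn: embeddings are clustered with MiniBatchKMeans and a fixed fraction of each cluster is kept, ranked by a per-sample generalization score. The random embeddings, scores, cluster count, and keep fraction are placeholders standing in for the model's semantic vectors and the original-vs-perturbed difference metric.</p> <pre><code class="language-python">
# Cluster-then-select sketch: semantic clustering with MiniBatchKMeans,
# followed by keeping the highest-scoring fraction of each cluster.
import numpy as np
from sklearn.cluster import MiniBatchKMeans

rng = np.random.default_rng(0)
embeddings = rng.normal(size=(1000, 64))   # stand-in semantic vectors
gen_scores = rng.random(1000)              # stand-in generalization metric

def select_training_subset(embeddings, gen_scores, n_clusters=8, keep_frac=1/3):
    labels = MiniBatchKMeans(n_clusters=n_clusters, random_state=0).fit_predict(embeddings)
    keep = []
    for c in range(n_clusters):
        idx = np.where(labels == c)[0]
        k = max(1, int(len(idx) * keep_frac))
        # Highest-scoring samples within this semantic cluster.
        keep.extend(idx[np.argsort(gen_scores[idx])[::-1][:k]])
    return np.sort(np.array(keep))

subset = select_training_subset(embeddings, gen_scores)
print(len(subset), "of", len(embeddings), "samples kept")
</code></pre>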
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.10516">arXiv:2409.10516</a> <span> [<a href="https://arxiv.org/pdf/2409.10516">pdf</a>, <a href="https://arxiv.org/format/2409.10516">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> RetrievalAttention: Accelerating Long-Context LLM Inference via Vector Retrieval </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+D">Di Liu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+M">Meng Chen</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+B">Baotong Lu</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+H">Huiqiang Jiang</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zhenhua Han</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Q">Qianxi Zhang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Q">Qi Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chengruidong Zhang</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+B">Bailu Ding</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kai Zhang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+C">Chen Chen</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+F">Fan Yang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yuqing Yang</a>, <a href="/search/cs?searchtype=author&query=Qiu%2C+L">Lili Qiu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.10516v2-abstract-short" style="display: inline;"> Transformer-based Large Language Models (LLMs) have become increasingly important. However, due to the quadratic time complexity of attention computation, scaling LLMs to longer contexts incurs extremely slow inference latency and high GPU memory consumption for caching key-value (KV) vectors. This paper proposes RetrievalAttention, a training-free approach to both accelerate attention computation… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.10516v2-abstract-full').style.display = 'inline'; document.getElementById('2409.10516v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.10516v2-abstract-full" style="display: none;"> Transformer-based Large Language Models (LLMs) have become increasingly important. However, due to the quadratic time complexity of attention computation, scaling LLMs to longer contexts incurs extremely slow inference latency and high GPU memory consumption for caching key-value (KV) vectors. This paper proposes RetrievalAttention, a training-free approach to both accelerate attention computation and reduce GPU memory consumption. By leveraging the dynamic sparsity of attention mechanism, RetrievalAttention proposes to use approximate nearest neighbor search (ANNS) indexes for KV vectors in CPU memory and retrieves the most relevant ones with vector search during generation. 
Unfortunately, we observe that the off-the-shelf ANNS indexes are often ineffective for such retrieval tasks due to the out-of-distribution (OOD) between query vectors and key vectors in attention mechanism. RetrievalAttention addresses the OOD challenge by designing an attention-aware vector search algorithm that can adapt to the distribution of query vectors. Our evaluation shows that RetrievalAttention only needs to access 1--3% of data while maintaining high model accuracy. This leads to significant reduction in the inference cost of long-context LLMs with much lower GPU memory footprint. In particular, RetrievalAttention only needs a single NVIDIA RTX4090 (24GB) for serving 128K tokens in LLMs with 8B parameters, which is capable of generating one token in 0.188 seconds. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.10516v2-abstract-full').style.display = 'none'; document.getElementById('2409.10516v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">16 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.07924">arXiv:2409.07924</a> <span> [<a href="https://arxiv.org/pdf/2409.07924">pdf</a>, <a href="https://arxiv.org/format/2409.07924">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Universal Trajectory Optimization Framework for Differential Drive Robot Class </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Mengke Zhang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+N">Nanhe Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hu Wang</a>, <a href="/search/cs?searchtype=author&query=Qiu%2C+J">Jianxiong Qiu</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zhichao Han</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+Q">Qiuyu Ren</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+C">Chao Xu</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+F">Fei Gao</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+Y">Yanjun Cao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.07924v2-abstract-short" style="display: inline;"> Differential drive robots are widely used in various scenarios thanks to their straightforward principle, from household service robots to disaster response field robots. There are several types of driving mechanisms for real-world applications, including two-wheeled, four-wheeled skid-steering, tracked robots, and so on. 
The differences in the driving mechanisms usually require specific kinematic… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.07924v2-abstract-full').style.display = 'inline'; document.getElementById('2409.07924v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.07924v2-abstract-full" style="display: none;"> Differential drive robots are widely used in various scenarios thanks to their straightforward principle, from household service robots to disaster response field robots. There are several types of driving mechanisms for real-world applications, including two-wheeled, four-wheeled skid-steering, tracked robots, and so on. The differences in the driving mechanisms usually require specific kinematic modeling when precise control is desired. Furthermore, the nonholonomic dynamics and possible lateral slip lead to different degrees of difficulty in getting feasible and high-quality trajectories. Therefore, a comprehensive trajectory optimization framework to compute trajectories efficiently for various kinds of differential drive robots is highly desirable. In this paper, we propose a universal trajectory optimization framework that can be applied to differential drive robots, enabling the generation of high-quality trajectories within a restricted computational timeframe. We introduce a novel trajectory representation based on polynomial parameterization of motion states or their integrals, such as angular and linear velocities, which inherently matches the robots' motion to the control principle. The trajectory optimization problem is formulated to minimize complexity while prioritizing safety and operational efficiency. We then build a full-stack autonomous planning and control system to demonstrate its feasibility and robustness. We conduct extensive simulations and real-world testing in crowded environments with three kinds of differential drive robots to validate the effectiveness of our approach. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.07924v2-abstract-full').style.display = 'none'; document.getElementById('2409.07924v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
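<p class="is-size-7">A toy illustration of the trajectory representation mentioned above: linear and angular velocities of a differential-drive robot are expressed as polynomials of time, and the pose is recovered by integrating unicycle kinematics. The coefficients and horizon are arbitrary; the paper's actual optimization of such parameterizations under safety and efficiency objectives is not reproduced here.</p> <pre><code class="language-python">
# Roll out a pose trajectory from polynomial velocity profiles v(t), w(t)
# using simple Euler integration of unicycle kinematics.
import numpy as np

def rollout(v_coeffs, w_coeffs, duration=5.0, dt=0.01):
    """Integrate x' = v cos(th), y' = v sin(th), th' = w with polynomial v, w."""
    t = np.arange(0.0, duration, dt)
    v = np.polyval(v_coeffs, t)      # linear velocity profile v(t)
    w = np.polyval(w_coeffs, t)      # angular velocity profile w(t)
    x = y = th = 0.0
    path = []
    for vi, wi in zip(v, w):
        x += vi * np.cos(th) * dt
        y += vi * np.sin(th) * dt
        th += wi * dt
        path.append((x, y, th))
    return np.array(path)

path = rollout(v_coeffs=[0.0, 0.2, 0.5], w_coeffs=[0.1, -0.2])
print(path[-1])   # final pose reached by this velocity parameterization
</code></pre>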
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">15 pages, 15 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.07374">arXiv:2409.07374</a> <span> [<a href="https://arxiv.org/pdf/2409.07374">pdf</a>, <a href="https://arxiv.org/format/2409.07374">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> </div> </div> <p class="title is-5 mathjax"> Extracting TCPIP Headers at High Speed for the Anonymized Network Traffic Graph Challenge </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zhaoyang Han</a>, <a href="/search/cs?searchtype=author&query=Briasco-Stewart%2C+A">Andrew Briasco-Stewart</a>, <a href="/search/cs?searchtype=author&query=Zink%2C+M">Michael Zink</a>, <a href="/search/cs?searchtype=author&query=Leeser%2C+M">Miriam Leeser</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.07374v1-abstract-short" style="display: inline;"> Field Programmable Gate Arrays (FPGAs) play a significant role in computationally intensive network processing due to their flexibility and efficiency. Particularly with the high-level abstraction of the P4 network programming model, FPGA shows a powerful potential for packet processing. By supporting the P4 language with FPGA processing, network researchers can create customized FPGA-based networ… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.07374v1-abstract-full').style.display = 'inline'; document.getElementById('2409.07374v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.07374v1-abstract-full" style="display: none;"> Field Programmable Gate Arrays (FPGAs) play a significant role in computationally intensive network processing due to their flexibility and efficiency. Particularly with the high-level abstraction of the P4 network programming model, FPGA shows a powerful potential for packet processing. By supporting the P4 language with FPGA processing, network researchers can create customized FPGA-based network functions and execute network tasks on accelerators directly connected to the network. A feature of the P4 language is that it is stateless; however, the FPGA implementation in this research requires state information. This is accomplished using P4 externs to describe the stateful portions of the design and to implement them on the FPGA using High-Level Synthesis (HLS). This paper demonstrates using an FPGA-based SmartNIC to efficiently extract source-destination IP address information from network packets and construct anonymized network traffic matrices for further analysis. The implementation is the first example of the combination of using P4 and HLS in developing network functions on the latest AMD FPGAs. Our design achieves a processing rate of approximately 95 Gbps with the combined use of P4 and High-level Synthesis and is able to keep up with 100 Gbps traffic received directly from the network. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.07374v1-abstract-full').style.display = 'none'; document.getElementById('2409.07374v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.05477">arXiv:2409.05477</a> <span> [<a href="https://arxiv.org/pdf/2409.05477">pdf</a>, <a href="https://arxiv.org/format/2409.05477">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Retrofitting Temporal Graph Neural Networks with Transformer </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+Q">Qiang Huang</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+X">Xiao Yan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xin Wang</a>, <a href="/search/cs?searchtype=author&query=Rao%2C+S+X">Susie Xi Rao</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zhichao Han</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+F">Fangcheng Fu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wentao Zhang</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+J">Jiawei Jiang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.05477v3-abstract-short" style="display: inline;"> Temporal graph neural networks (TGNNs) outperform regular GNNs by incorporating time information into graph-based operations. However, TGNNs adopt specialized models (e.g., TGN, TGAT, and APAN ) and require tailored training frameworks (e.g., TGL and ETC). In this paper, we propose TF-TGN, which uses Transformer decoder as the backbone model for TGNN to enjoy Transformer's codebase for efficient t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.05477v3-abstract-full').style.display = 'inline'; document.getElementById('2409.05477v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.05477v3-abstract-full" style="display: none;"> Temporal graph neural networks (TGNNs) outperform regular GNNs by incorporating time information into graph-based operations. However, TGNNs adopt specialized models (e.g., TGN, TGAT, and APAN ) and require tailored training frameworks (e.g., TGL and ETC). In this paper, we propose TF-TGN, which uses Transformer decoder as the backbone model for TGNN to enjoy Transformer's codebase for efficient training. In particular, Transformer achieves tremendous success for language modeling, and thus the community developed high-performance kernels (e.g., flash-attention and memory-efficient attention) and efficient distributed training schemes (e.g., PyTorch FSDP, DeepSpeed, and Megatron-LM). We observe that TGNN resembles language modeling, i.e., the message aggregation operation between chronologically occurring nodes and their temporal neighbors in TGNNs can be structured as sequence modeling. 
Besides this similarity, we also incorporate a series of algorithm designs including suffix infilling, temporal graph attention with self-loop, and causal masking self-attention to make TF-TGN work. During training, existing systems are slow in transforming the graph topology and conducting graph sampling. As such, we propose methods to parallelize the CSR format conversion and graph sampling. We also adapt the Transformer codebase to train TF-TGN efficiently with multiple GPUs. We experiment with 9 graphs and compare with 2 state-of-the-art TGNN training frameworks. The results show that TF-TGN can accelerate training by over 2.20× while providing comparable or even superior accuracy to existing SOTA TGNNs. TF-TGN is available at https://github.com/qianghuangwhu/TF-TGN. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.05477v3-abstract-full').style.display = 'none'; document.getElementById('2409.05477v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">conference Under review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.03733">arXiv:2409.03733</a> <span> [<a href="https://arxiv.org/pdf/2409.03733">pdf</a>, <a href="https://arxiv.org/format/2409.03733">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Planning In Natural Language Improves LLM Search For Code Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+E">Evan Wang</a>, <a href="/search/cs?searchtype=author&query=Cassano%2C+F">Federico Cassano</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+C">Catherine Wu</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+Y">Yunfeng Bai</a>, <a href="/search/cs?searchtype=author&query=Song%2C+W">Will Song</a>, <a href="/search/cs?searchtype=author&query=Nath%2C+V">Vaskar Nath</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Ziwen Han</a>, <a href="/search/cs?searchtype=author&query=Hendryx%2C+S">Sean Hendryx</a>, <a href="/search/cs?searchtype=author&query=Yue%2C+S">Summer Yue</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Hugh Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.03733v2-abstract-short" style="display: inline;"> While scaling training compute has led to remarkable improvements in large language models (LLMs), scaling inference compute has not yet yielded analogous gains.
We hypothesize that a core missing component is a lack of diverse LLM outputs, leading to inefficient search due to models repeatedly sampling highly similar, yet incorrect generations. We empirically demonstrate that this lack of diversi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.03733v2-abstract-full').style.display = 'inline'; document.getElementById('2409.03733v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.03733v2-abstract-full" style="display: none;"> While scaling training compute has led to remarkable improvements in large language models (LLMs), scaling inference compute has not yet yielded analogous gains. We hypothesize that a core missing component is a lack of diverse LLM outputs, leading to inefficient search due to models repeatedly sampling highly similar, yet incorrect generations. We empirically demonstrate that this lack of diversity can be mitigated by searching over candidate plans for solving a problem in natural language. Based on this insight, we propose PlanSearch, a novel search algorithm which shows strong results across HumanEval+, MBPP+, and LiveCodeBench (a contamination-free benchmark for competitive coding). PlanSearch generates a diverse set of observations about the problem and then uses these observations to construct plans for solving the problem. By searching over plans in natural language rather than directly over code solutions, PlanSearch explores a significantly more diverse range of potential solutions compared to baseline search methods. Using PlanSearch on top of Claude 3.5 Sonnet achieves a state-of-the-art pass@200 of 77.0% on LiveCodeBench, outperforming both the best score achieved without search (pass@1 = 41.4%) and using standard repeated sampling (pass@200 = 60.6%). Finally, we show that, across all models, search algorithms, and benchmarks analyzed, we can accurately predict performance gains due to search as a direct function of the diversity over generated ideas. Code can be found at https://github.com/scaleapi/plansearch. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.03733v2-abstract-full').style.display = 'none'; document.getElementById('2409.03733v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 5 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
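<p class="is-size-7">For reference on the pass@k numbers quoted above: pass@k is conventionally estimated with the unbiased estimator of Chen et al. (2021), shown below. The sample counts in the example are made up and unrelated to the paper's reported results.</p> <pre><code class="language-python">
# Unbiased pass@k estimator: for a problem where c of n sampled solutions
# pass the tests, pass@k = 1 - C(n-c, k) / C(n, k).
from math import comb

def pass_at_k(n: int, c: int, k: int) -> float:
    """Chance that at least one of k samples drawn from n generations
    (c of which are correct) solves the problem."""
    if k > n - c:          # fewer incorrect samples than draws: guaranteed hit
        return 1.0
    return 1.0 - comb(n - c, k) / comb(n, k)

# Example with made-up counts: 200 samples per problem, 90 passing.
print(round(pass_at_k(200, 90, 1), 2))   # 0.45
print(pass_at_k(200, 90, 200))           # 1.0
</code></pre>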
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.00298">arXiv:2409.00298</a> <span> [<a href="https://arxiv.org/pdf/2409.00298">pdf</a>, <a href="https://arxiv.org/format/2409.00298">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> </div> </div> <p class="title is-5 mathjax"> Dual-Polarized Reconfigurable Intelligent Surface-Based Antenna for Holographic MIMO Communications </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zeng%2C+S">Shuhao Zeng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Hongliang Zhang</a>, <a href="/search/cs?searchtype=author&query=Di%2C+B">Boya Di</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zhu Han</a>, <a href="/search/cs?searchtype=author&query=Poor%2C+H+V">H. Vincent Poor</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.00298v1-abstract-short" style="display: inline;"> Holographic multiple-input-multiple output (HMIMO), which is enabled by large-scale antenna arrays with quasi-continuous apertures, is expected to be an important technology in the forthcoming 6G wireless network. Reconfigurable intelligent surface (RIS)-based antennas provide an energy-efficient solution for implementing HMIMO. Most existing works in this area focus on single-polarized RIS-enable… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.00298v1-abstract-full').style.display = 'inline'; document.getElementById('2409.00298v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.00298v1-abstract-full" style="display: none;"> Holographic multiple-input-multiple output (HMIMO), which is enabled by large-scale antenna arrays with quasi-continuous apertures, is expected to be an important technology in the forthcoming 6G wireless network. Reconfigurable intelligent surface (RIS)-based antennas provide an energy-efficient solution for implementing HMIMO. Most existing works in this area focus on single-polarized RIS-enabled HMIMO, where the RIS can only reflect signals in one polarization towards users and signals in the other polarization cannot be received by intended users, leading to degraded data rate. To improve multiplexing performance, in this paper, we consider a dual-polarized RIS-enabled single-user HMIMO network, aiming to optimize power allocations across polarizations and analyze corresponding maximum system capacity. However, due to interference between different polarizations, the dual-polarized system cannot be simply decomposed into two independent single-polarized ones. Therefore, existing methods developed for the single-polarized system cannot be directly applied, which makes the optimization and analysis of the dual-polarized system challenging. To cope with this issue, we derive an asymptotically tight upper bound on the ergodic capacity, based on which the power allocations across two polarizations are optimized. Potential gains achievable with such dual-polarized RIS are analyzed. Numerical results verify our analysis. 
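<p class="is-size-7">A toy numerical sketch of the power-allocation question discussed above: split a total power budget across the two polarizations under a simple model where a fraction of each polarization's power leaks into the other as interference, and grid-search the split that maximizes sum rate. The gains, leakage factor, and noise level are invented, and the paper optimizes an ergodic-capacity upper bound rather than this simple sum-rate expression.</p> <pre><code class="language-python">
# Grid search over the power split between two mutually interfering
# polarizations, maximizing a toy sum-rate objective.
import numpy as np

def best_power_split(p_total=1.0, g=(2.0, 1.2), leak=0.15, noise=0.1, steps=1001):
    splits = np.linspace(0.0, p_total, steps)
    best_rate, best_split = -np.inf, None
    for p1 in splits:
        p2 = p_total - p1
        sinr1 = g[0] * p1 / (noise + leak * g[1] * p2)
        sinr2 = g[1] * p2 / (noise + leak * g[0] * p1)
        rate = np.log2(1 + sinr1) + np.log2(1 + sinr2)
        if rate > best_rate:
            best_rate, best_split = rate, (p1, p2)
    return best_split, best_rate

split, rate = best_power_split()
print(split, round(float(rate), 3))
</code></pre>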

arXiv:2408.15978  [cs.AI]  https://arxiv.org/abs/2408.15978
WebPilot: A Versatile and Autonomous Multi-Agent System for Web Task Execution with Strategic Exploration
Authors: Yao Zhang, Zijian Ma, Yunpu Ma, Zhen Han, Yu Wu, Volker Tresp
Abstract: LLM-based autonomous agents often fail to execute complex web tasks that require dynamic interaction due to the inherent uncertainty and complexity of these environments. Existing LLM-based web agents typically rely on rigid, expert-designed policies specific to certain states and actions, which lack the flexibility and generalizability needed to adapt to unseen tasks. In contrast, humans excel by exploring unknowns, continuously adapting strategies, and resolving ambiguities through exploration. To emulate human-like adaptability, web agents need strategic exploration and complex decision-making. Monte Carlo Tree Search (MCTS) is well-suited for this, but classical MCTS struggles with vast action spaces, unpredictable state transitions, and incomplete information in web tasks. In light of this, we develop WebPilot, a multi-agent system with a dual optimization strategy that improves MCTS to better handle complex web environments. Specifically, the Global Optimization phase involves generating a high-level plan by breaking down tasks into manageable subtasks and continuously refining this plan, thereby focusing the search process and mitigating the challenges posed by vast action spaces in classical MCTS. Subsequently, the Local Optimization phase executes each subtask using a tailored MCTS designed for complex environments, effectively addressing uncertainties and managing incomplete information. Experimental results on WebArena and MiniWoB++ demonstrate the effectiveness of WebPilot. Notably, on WebArena, WebPilot achieves SOTA performance with GPT-4, with a 93% relative increase in success rate over the concurrent tree-search-based method. WebPilot marks a significant advancement in general autonomous agent capabilities, paving the way for more advanced and reliable decision-making in practical environments.
Submitted 28 August, 2024; originally announced August 2024.
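
Editorial note: for readers unfamiliar with MCTS, the fragment below shows the generic UCT selection rule that tree-search agents of this kind typically build on. It is a standard textbook sketch with invented names and values, not WebPilot's actual Global/Local Optimization implementation.

    import math
    from dataclasses import dataclass, field

    @dataclass
    class Node:
        state: object
        parent: "Node | None" = None
        children: list["Node"] = field(default_factory=list)
        visits: int = 0
        value: float = 0.0  # sum of rollout returns

    def uct_select(node: Node, c: float = 1.4) -> Node:
        """Pick the child maximizing the UCT score (exploitation + exploration)."""
        return max(
            node.children,
            key=lambda ch: (ch.value / (ch.visits + 1e-9))
            + c * math.sqrt(math.log(node.visits + 1) / (ch.visits + 1e-9)),
        )

    # Toy usage with two unevenly visited children (values are made up).
    root = Node(state="s0", visits=10)
    root.children = [Node(state="s1", parent=root, visits=7, value=4.2),
                     Node(state="s2", parent=root, visits=3, value=2.4)]
    print(uct_select(root).state)  # "s2": the less-visited child wins on the exploration bonus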

arXiv:2408.15221  [cs.LG, cs.CL, cs.CR, cs.CY]  https://arxiv.org/abs/2408.15221
LLM Defenses Are Not Robust to Multi-Turn Human Jailbreaks Yet
Authors: Nathaniel Li, Ziwen Han, Ian Steneker, Willow Primack, Riley Goodside, Hugh Zhang, Zifan Wang, Cristina Menghini, Summer Yue
Abstract: Recent large language model (LLM) defenses have greatly improved models' ability to refuse harmful queries, even when adversarially attacked. However, LLM defenses are primarily evaluated against automated adversarial attacks in a single turn of conversation, an insufficient threat model for real-world malicious use. We demonstrate that multi-turn human jailbreaks uncover significant vulnerabilities, exceeding 70% attack success rate (ASR) on HarmBench against defenses that report single-digit ASRs with automated single-turn attacks. Human jailbreaks also reveal vulnerabilities in machine unlearning defenses, successfully recovering dual-use biosecurity knowledge from unlearned models. We compile these results into Multi-Turn Human Jailbreaks (MHJ), a dataset of 2,912 prompts across 537 multi-turn jailbreaks. We publicly release MHJ alongside a compendium of jailbreak tactics developed across dozens of commercial red teaming engagements, supporting research towards stronger LLM defenses.
Submitted 3 September, 2024; v1 submitted 27 August, 2024; originally announced August 2024.

arXiv:2408.14279  [cs.CV]  https://arxiv.org/abs/2408.14279
Learning Local Pattern Modularization for Point Cloud Reconstruction from Unseen Classes
Authors: Chao Chen, Yu-Shen Liu, Zhizhong Han
Abstract: It is challenging to reconstruct 3D point clouds in unseen classes from single 2D images. Rather than using an object-centered coordinate system, current methods generalize global priors learned in seen classes to reconstruct 3D shapes from unseen classes in a viewer-centered coordinate system. However, the reconstruction accuracy and interpretability still leave much room for improvement. To resolve this issue, we introduce learning local pattern modularization for reconstructing 3D shapes in unseen classes, which achieves both good generalization ability and high reconstruction accuracy. Our insight is to learn a local prior which is class-agnostic and easy to generalize in an object-centered coordinate system. Specifically, the local prior is learned via a process of learning and customizing local pattern modularization in seen classes. During this process, we first learn a set of patterns in local regions, which serve as a basis in the object-centered coordinate system for representing an arbitrary region on shapes across different classes. Then, we modularize each region on an initially reconstructed shape using the learned local patterns. Based on that, we customize the local pattern modularization using the input image by refining the reconstruction with more details. Our method can reconstruct high-fidelity point clouds from unseen classes in an object-centered coordinate system without requiring a large number of patterns or any additional information, such as segmentation supervision or camera poses. Our experimental results under widely used benchmarks show that our method achieves state-of-the-art reconstruction accuracy for shapes from unseen classes. The code is available at https://github.com/chenchao15/Unseen.
Submitted 4 September, 2024; v1 submitted 26 August, 2024; originally announced August 2024.
Comments: 14 pages, 11 figures; accepted by ECCV 2024

arXiv:2408.12860  [cs.NI, eess.SP]  https://arxiv.org/abs/2408.12860
Active STAR-RIS Empowered Edge System for Enhanced Energy Efficiency and Task Management
Authors: Pyae Sone Aung, Kitae Kim, Yan Kyaw Tun, Zhu Han, Choong Seon Hong
Abstract: The proliferation of data-intensive and low-latency applications has driven the development of multi-access edge computing (MEC) as a viable solution to meet the increasing demands for high-performance computing and storage capabilities at the network edge. Despite the benefits of MEC, challenges such as non-line-of-sight (NLoS) communication caused by obstructions persist. Reconfigurable intelligent surfaces (RISs) and the more advanced simultaneously transmitting and reflecting (STAR)-RISs have emerged to address these challenges; however, practical limitations and multiplicative fading effects hinder their efficacy. We propose an active STAR-RIS-assisted MEC system to overcome these obstacles, leveraging the advantages of active STAR-RIS. The main contributions consist of formulating an optimization problem to minimize energy consumption with task queue stability by jointly optimizing the partial task offloading, amplitude, phase shift coefficients, amplification coefficients, transmit power of the base station (BS), and admitted tasks. Furthermore, we decompose the non-convex problem into manageable sub-problems, employing sequential fractional programming for transmit power control, a convex optimization technique for task offloading, and Lyapunov optimization with a double deep Q-network (DDQN) for joint amplitude, phase shift, amplification, and task admission. Extensive performance evaluations demonstrate the superiority of the proposed system over benchmark schemes, highlighting its potential for enhancing MEC system performance. Numerical results indicate that our proposed system outperforms the conventional STAR-RIS-assisted system by 18.64% and the conventional RIS-assisted system by 30.43%, respectively.
Submitted 23 August, 2024; originally announced August 2024.
Comments: 13 pages, 10 figures
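
Editorial note: the task-queue-stability constraint mentioned above is commonly handled, as in the Lyapunov optimization step the abstract names, with a virtual-queue recursion Q(t+1) = max(Q(t) - b(t), 0) + a(t) and a drift-plus-penalty score. The sketch below shows only that generic textbook machinery with made-up arrival/service numbers; it is not the paper's actual controller, and the trade-off weight V is an assumed hyperparameter.

    def queue_update(q: float, arrivals: float, service: float) -> float:
        """Standard Lyapunov queue recursion: Q(t+1) = max(Q(t) - b(t), 0) + a(t)."""
        return max(q - service, 0.0) + arrivals

    def drift_plus_penalty(q: float, energy: float, arrivals: float,
                           service: float, v: float = 10.0) -> float:
        """Generic drift-plus-penalty objective: V * penalty + Q(t) * (a(t) - b(t)).
        Larger V favors energy saving over queue backlog (assumed weight)."""
        return v * energy + q * (arrivals - service)

    # Toy trace with assumed arrivals/service: the queue stays bounded when service keeps up.
    q = 0.0
    for a, b in [(2.0, 1.5), (1.0, 1.5), (3.0, 2.5)]:
        q = queue_update(q, arrivals=a, service=b)
    print(q)  # 3.0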

arXiv:2408.12616  [cs.CV, cs.AI]  https://arxiv.org/abs/2408.12616
Semantic Communication based on Large Language Model for Underwater Image Transmission
Authors: Weilong Chen, Wenxuan Xu, Haoran Chen, Xinran Zhang, Zhijin Qin, Yanru Zhang, Zhu Han
Abstract: Underwater communication is essential for environmental monitoring, marine biology research, and underwater exploration. Traditional underwater communication faces limitations like low bandwidth, high latency, and susceptibility to noise, while semantic communication (SC) offers a promising solution by focusing on the exchange of semantics rather than symbols or bits. However, SC encounters challenges in underwater environments, including semantic information mismatch and difficulties in accurately identifying and transmitting critical information that aligns with the diverse requirements of underwater applications. To address these challenges, we propose a novel SC framework based on Large Language Models (LLMs). Our framework leverages visual LLMs to perform semantic compression and prioritization of underwater image data according to the query from users. By identifying and encoding key semantic elements within the images, the system selectively transmits high-priority information while applying higher compression rates to less critical regions. On the receiver side, an LLM-based recovery mechanism, along with Global Vision ControlNet and Key Region ControlNet networks, aids in reconstructing the images, thereby enhancing communication efficiency and robustness. Our framework reduces the overall data size to 0.8% of the original. Experimental results demonstrate that our method significantly outperforms existing approaches, ensuring high-quality, semantically accurate image reconstruction.
Submitted 25 August, 2024; v1 submitted 8 August, 2024; originally announced August 2024.

arXiv:2408.12548  [cs.LG]  https://arxiv.org/abs/2408.12548
Human-In-The-Loop Machine Learning for Safe and Ethical Autonomous Vehicles: Principles, Challenges, and Opportunities
Authors: Yousef Emami, Luis Almeida, Kai Li, Wei Ni, Zhu Han
Abstract: Rapid advances in Machine Learning (ML) have triggered new trends in Autonomous Vehicles (AVs). ML algorithms play a crucial role in interpreting sensor data, predicting potential hazards, and optimizing navigation strategies. However, achieving full autonomy in cluttered and complex situations, such as intricate intersections, diverse sceneries, varied trajectories, and complex missions, is still challenging, and the cost of data labeling remains a significant bottleneck. The adaptability and robustness of humans in complex scenarios motivate the inclusion of humans in the ML process, leveraging their creativity, ethical power, and emotional intelligence to improve ML effectiveness. The scientific community knows this approach as Human-In-The-Loop Machine Learning (HITL-ML). Towards safe and ethical autonomy, we present a review of HITL-ML for AVs, focusing on Curriculum Learning (CL), Human-In-The-Loop Reinforcement Learning (HITL-RL), Active Learning (AL), and ethical principles. In CL, human experts systematically train ML models by starting with simple tasks and gradually progressing to more difficult ones. HITL-RL significantly enhances the RL process by incorporating human input through techniques like reward shaping, action injection, and interactive learning. AL streamlines the annotation process by targeting specific instances that need to be labeled with human oversight, reducing the overall time and cost associated with training. Ethical principles must be embedded in AVs to align their behavior with societal values and norms. In addition, we provide insights and specify future research directions.
Submitted 7 September, 2024; v1 submitted 22 August, 2024; originally announced August 2024.
Comments: 19 pages, 4 figures
MSC Class: 00; ACM Class: A.1; I.2
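
Editorial note: of the HITL-RL techniques listed in this survey, reward shaping is the simplest to illustrate: the learner's environment reward is blended with a human-provided signal. The sketch below is a generic, hypothetical example of that idea; the function, weight, and numbers are invented and do not come from the paper.

    def shaped_reward(env_reward: float, human_feedback: float,
                      weight: float = 0.5) -> float:
        """Blend the environment reward with a human feedback score in [-1, 1].
        The weight is an assumed hyperparameter controlling human influence."""
        return env_reward + weight * human_feedback

    # Example: the environment gives +1 for reaching a waypoint, but a human
    # rater penalizes an uncomfortable maneuver.
    print(shaped_reward(env_reward=1.0, human_feedback=-0.8))  # 0.6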

arXiv:2408.10211  [cs.LO, math.LO]  https://arxiv.org/abs/2408.10211
Gödel Incompleteness Theorem for PAC Learnable Theory
Authors: Zhifeng Ma, Tianyi Wu, Zhangang Han
Abstract: Departing from the view that information is objective reality, we adopt the idea that all information needs to be compiled by an interpreter before it can be observed. We then formulate the interpreter-based information theory together with its notion of complexity. Through the relationship between complexity and PAC learnability, the general PAC learnability corresponding to general complexity is derived. We then generalize the observation process to a formal system with functors, in which we give a concrete proof of the generalized Gödel incompleteness theorem. Finally, the Cantor set, the original Gödel incompleteness theorem, the Turing halting problem, and the EPR paradox are placed in the same mathematical framework as specific applications of the general Gödel incompleteness theorem.
Submitted 16 July, 2024; originally announced August 2024.
Comments: 24 pages

arXiv:2408.06969  [cs.NI, cs.LG]  https://arxiv.org/abs/2408.06969
IRS-Assisted Lossy Communications Under Correlated Rayleigh Fading: Outage Probability Analysis and Optimization
Authors: Guanchang Li, Wensheng Lin, Lixin Li, Yixuan He, Fucheng Yang, Zhu Han
Abstract: This paper focuses on an intelligent reflecting surface (IRS)-assisted lossy communication system with correlated Rayleigh fading. We analyze the correlated channel model and derive the outage probability of the system. Then, we design a deep reinforcement learning (DRL) method to optimize the phase shifts of the IRS in order to maximize the received signal power. Moreover, this paper presents results of the simulations conducted to evaluate the performance of the DRL-based method. The simulation results indicate that the outage probability of the considered system increases significantly with more correlated channel coefficients. Moreover, the performance gap between DRL and the theoretical limit increases with higher transmit power and/or a larger distortion requirement.
Submitted 13 August, 2024; originally announced August 2024.
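
Editorial note: to make "correlated Rayleigh fading" and "outage probability" concrete, the snippet below draws correlated complex Gaussian channel coefficients through a Cholesky factor of an assumed correlation matrix and estimates outage by Monte Carlo. It is a generic illustration with made-up parameters (correlation 0.7, an arbitrary SNR threshold), not the paper's closed-form analysis or its DRL optimizer.

    import numpy as np

    rng = np.random.default_rng(0)

    def correlated_rayleigh(R: np.ndarray, n_samples: int) -> np.ndarray:
        """Draw channel vectors h = L g with R = L L^H and g ~ CN(0, I)."""
        L = np.linalg.cholesky(R)
        g = (rng.standard_normal((R.shape[0], n_samples))
             + 1j * rng.standard_normal((R.shape[0], n_samples))) / np.sqrt(2)
        return L @ g

    # Assumed 2-element correlation matrix and an arbitrary outage threshold.
    R = np.array([[1.0, 0.7], [0.7, 1.0]])
    h = correlated_rayleigh(R, n_samples=100_000)
    snr = np.abs(h.sum(axis=0)) ** 2        # toy combined channel power
    outage_prob = np.mean(snr < 0.5)        # fraction of draws below the threshold
    print(outage_prob)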

arXiv:2408.06635  [cs.CV]  https://arxiv.org/abs/2408.06635
IDRetracor: Towards Visual Forensics Against Malicious Face Swapping
Authors: Jikang Cheng, Jiaxin Ai, Zhen Han, Chao Liang, Qin Zou, Zhongyuan Wang, Qian Wang
Abstract: The face swapping technique based on deepfake methods poses significant social risks to personal identity security. While numerous deepfake detection methods have been proposed as countermeasures against malicious face swapping, they can only output binary labels (Fake/Real) for distinguishing fake content without reliable and traceable evidence. To achieve visual forensics and target face attribution, we propose a novel task named face retracing, which considers retracing the original target face from the given fake one via inverse mapping. Toward this goal, we propose an IDRetracor that can retrace arbitrary original target identities from fake faces generated by multiple face swapping methods. Specifically, we first adopt a mapping resolver to perceive the possible solution space of the original target face for the inverse mappings. Then, we propose mapping-aware convolutions to retrace the original target face from the fake one. Such convolutions contain multiple kernels that can be combined under the control of the mapping resolver to tackle different face swapping mappings dynamically. Extensive experiments demonstrate that the IDRetracor exhibits promising retracing performance from both quantitative and qualitative perspectives.
Submitted 13 August, 2024; originally announced August 2024.

arXiv:2408.05112  [cs.LG, cs.AI, eess.IV]  https://arxiv.org/abs/2408.05112
Semantic Successive Refinement: A Generative AI-aided Semantic Communication Framework
Authors: Kexin Zhang, Lixin Li, Wensheng Lin, Yuna Yan, Rui Li, Wenchi Cheng, Zhu Han
Abstract: Semantic Communication (SC) is an emerging technology aiming to surpass the Shannon limit. Traditional SC strategies often minimize signal distortion between the original and reconstructed data, neglecting perceptual quality, especially in low Signal-to-Noise Ratio (SNR) environments. To address this issue, we introduce a novel Generative AI Semantic Communication (GSC) system for single-user scenarios. This system leverages deep generative models to establish a new paradigm in SC. Specifically, at the transmitter end, it employs a joint source-channel coding mechanism based on the Swin Transformer for efficient semantic feature extraction and compression. At the receiver end, an advanced Diffusion Model (DM) reconstructs high-quality images from degraded signals, enhancing perceptual details. Additionally, we present a Multi-User Generative Semantic Communication (MU-GSC) system utilizing an asynchronous processing model. This model effectively manages multiple user requests and optimally utilizes system resources for parallel processing. Simulation results on public datasets demonstrate that our generative AI semantic communication systems achieve superior transmission efficiency and enhanced communication content quality across various channel conditions. Compared to CNN-based DeepJSCC, our methods improve the Peak Signal-to-Noise Ratio (PSNR) by 17.75% in Additive White Gaussian Noise (AWGN) channels and by 20.86% in Rayleigh channels.
Submitted 31 July, 2024; originally announced August 2024.
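
Editorial note: the PSNR gains reported above are measured with the standard peak signal-to-noise ratio, PSNR = 10 log10(peak^2 / MSE). The sketch below is a minimal reference implementation of that standard definition on random test images; it is not code from the paper.

    import numpy as np

    def psnr(original: np.ndarray, reconstructed: np.ndarray, peak: float = 255.0) -> float:
        """PSNR in dB between two images: 10 * log10(peak^2 / MSE)."""
        mse = np.mean((original.astype(np.float64) - reconstructed.astype(np.float64)) ** 2)
        if mse == 0:
            return float("inf")
        return 10.0 * np.log10(peak ** 2 / mse)

    # Illustrative check on a random 8-bit image corrupted with mild Gaussian noise.
    a = np.random.randint(0, 256, (64, 64), dtype=np.uint8)
    b = np.clip(a + np.random.normal(0, 5, a.shape), 0, 255).astype(np.uint8)
    print(round(psnr(a, b), 1))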