Search | arXiv e-print repository
Showing 1–50 of 379 results for author: Fang, Z
Searching in archive cs. Search in all archives: https://arxiv.org/search/?searchtype=author&query=Fang%2C+Z
1. arXiv:2411.13612 [pdf, other] (cs.CR; cs.LG; cs.SD; eess.AS)
   Efficient Streaming Voice Steganalysis in Challenging Detection Scenarios
   Authors: Pengcheng Zhou, Zhengyang Fang, Zhongliang Yang, Zhili Zhou, Linna Zhou
   Abstract: In recent years, there has been an increasing number of information hiding techniques based on network streaming media, focusing on how to covertly and efficiently embed secret information into real-time transmitted network media signals to achieve concealed communication. The misuse of these techniques can lead to significant security risks, such as the spread of malicious code, commands, and viruses. Current steganalysis methods for network voice streams face two major challenges: efficient detection under low embedding rates and under short-duration conditions. These challenges arise because, with low embedding rates (e.g., as low as 10%) and short transmission durations (e.g., only 0.1 second), detection models struggle to acquire sufficiently rich sample features, making effective steganalysis difficult. To address these challenges, this paper introduces a Dual-View VoIP Steganalysis Framework (DVSF). The framework first randomly obfuscates parts of the native steganographic descriptors in VoIP stream segments, making the steganographic features of hard-to-detect samples more pronounced and easier to learn. It then captures fine-grained local features related to steganography, building on the global features of VoIP. Specially constructed VoIP segment triplets further adjust the feature distances within the model. Ultimately, this method effectively addresses the detection difficulty in VoIP. Extensive experiments demonstrate that our method significantly improves the accuracy of streaming voice steganalysis in these challenging detection scenarios, surpassing existing state-of-the-art methods and offering superior near-real-time performance.
   Submitted 19 November, 2024; originally announced November 2024.
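   For readers unfamiliar with the triplet mechanism this abstract mentions: constructing segment triplets and adjusting feature distances typically follows the shape of a standard triplet margin loss. A minimal Python sketch, assuming a hypothetical "encoder" module that maps VoIP segments to feature vectors (illustrative only, not the authors' DVSF implementation):

       import torch.nn.functional as F

       def triplet_distance_loss(encoder, anchor, positive, negative, margin=0.2):
           # encoder: a torch.nn.Module mapping VoIP segment tensors to
           # fixed-size feature vectors (hypothetical).
           fa, fp, fn = encoder(anchor), encoder(positive), encoder(negative)
           d_pos = F.pairwise_distance(fa, fp)  # anchor vs. same-class segment
           d_neg = F.pairwise_distance(fa, fn)  # anchor vs. opposite-class segment
           # Hinge: push same-class pairs closer than cross-class pairs by margin.
           return F.relu(d_pos - d_neg + margin).mean()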
2. arXiv:2411.10948 [pdf, other] (cs.LG; cs.CV)
   Towards Accurate and Efficient Sub-8-Bit Integer Training
   Authors: Wenjin Guo, Donglai Liu, Weiying Xie, Yunsong Li, Xuefei Ning, Zihan Meng, Shulin Zeng, Jie Lei, Zhenman Fang, Yu Wang
   Abstract: Neural network training is a memory- and compute-intensive task. Quantization, which enables low-bitwidth formats in training, can significantly mitigate the workload. To reduce quantization error, recent methods have developed new data formats and additional pre-processing operations on quantizers. However, it remains quite challenging to achieve high accuracy and efficiency simultaneously. In this paper, we explore sub-8-bit integer training from its essence of gradient descent optimization. Our integer training framework includes two components: ShiftQuant, to realize accurate gradient estimation, and L1 normalization, to smooth the loss landscape. ShiftQuant attains performance that approaches the theoretical upper bound of group quantization. Furthermore, it liberates group quantization from inefficient memory rearrangement. The L1 normalization facilitates the implementation of fully quantized normalization layers with impressive convergence accuracy. Our method frees sub-8-bit integer training from pre-processing and supports general devices.
   This framework achieves negligible accuracy loss across various neural networks and tasks (0.92% on 4-bit ResNets, 0.61% on 6-bit Transformers). The prototypical implementation of ShiftQuant achieves more than 1.85x / 15.3% performance improvement on CPU/GPU compared to its FP16 counterparts, and a 33.9% reduction in resource consumption on FPGA compared to the FP16 counterparts. The proposed fully quantized L1 normalization layers achieve more than a 35.54% improvement in throughput on CPU compared to traditional L2 normalization layers. Moreover, theoretical analysis verifies the advancement of our method.
   Submitted 16 November, 2024; originally announced November 2024.
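   The group quantization with shift-based scaling that ShiftQuant builds on can be sketched as follows. A minimal NumPy illustration, assuming per-group scales rounded up to powers of two so that rescaling reduces to a bit shift (a sketch of the general technique, not the paper's kernel):

       import numpy as np

       def shift_group_quantize(x, group_size=32, bits=6):
           # Assumes x.size is divisible by group_size.
           qmax = 2 ** (bits - 1) - 1
           groups = x.reshape(-1, group_size)
           max_abs = np.abs(groups).max(axis=1, keepdims=True) + 1e-12
           # Round each group's scale up to the nearest power of two, so
           # dequantization (q * scale) becomes a shift in integer hardware.
           scale = 2.0 ** np.ceil(np.log2(max_abs / qmax))
           q = np.clip(np.round(groups / scale), -qmax - 1, qmax).astype(np.int8)
           return q, scale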
3. arXiv:2411.04595 [pdf, other] (eess.IV; cs.CV)
   TexLiverNet: Leveraging Medical Knowledge and Spatial-Frequency Perception for Enhanced Liver Tumor Segmentation
   Authors: Xiaoyan Jiang, Zhi Zhou, Hailing Wang, Guozhong Wang, Zhijun Fang
   Abstract: Integrating textual data with imaging in liver tumor segmentation is essential for enhancing diagnostic accuracy. However, current multi-modal medical datasets offer only general text annotations, lacking lesion-specific details critical for extracting nuanced features, especially for fine-grained segmentation of tumor boundaries and small lesions. To address these limitations, we developed datasets with lesion-specific text annotations for liver tumors and introduced the TexLiverNet model. TexLiverNet employs an agent-based cross-attention module that integrates text features efficiently with visual features, significantly reducing computational costs. Additionally, enhanced spatial and adaptive frequency-domain perception is proposed to precisely delineate lesion boundaries, reduce background interference, and recover fine details in small lesions. Comprehensive evaluations on public and private datasets demonstrate that TexLiverNet achieves superior performance compared to current state-of-the-art methods.
   Submitted 7 November, 2024; originally announced November 2024.

4. arXiv:2411.03628 [pdf, other] (cs.CV; cs.AI)
   StreamingBench: Assessing the Gap for MLLMs to Achieve Streaming Video Understanding
   Authors: Junming Lin, Zheng Fang, Chi Chen, Zihao Wan, Fuwen Luo, Peng Li, Yang Liu, Maosong Sun
   Abstract: The rapid development of Multimodal Large Language Models (MLLMs) has expanded their capabilities from image comprehension to video understanding. However, most of these MLLMs focus primarily on offline video comprehension, necessitating extensive processing of all video frames before any queries can be made. This presents a significant gap compared to the human ability to watch, listen, think, and respond to streaming inputs in real time, highlighting the limitations of current MLLMs. In this paper, we introduce StreamingBench, the first comprehensive benchmark designed to evaluate the streaming video understanding capabilities of MLLMs. StreamingBench assesses three core aspects of streaming video understanding: (1) real-time visual understanding, (2) omni-source understanding, and (3) contextual understanding. The benchmark consists of 18 tasks, featuring 900 videos and 4,500 human-curated QA pairs. Each video features five questions presented at different time points to simulate a continuous streaming scenario. We conduct experiments on StreamingBench with 13 open-source and proprietary MLLMs and find that even the most advanced proprietary MLLMs, like Gemini 1.5 Pro and GPT-4o, perform significantly below human-level streaming video understanding capabilities. We hope our work can facilitate further advancements for MLLMs, empowering them to approach human-level video comprehension and interaction in more realistic scenarios.
   Submitted 5 November, 2024; originally announced November 2024.
5. arXiv:2411.01217 [pdf, other] (cs.GT; cs.MA)
   Preference-CFR: Beyond Nash Equilibrium for Better Game Strategies
   Authors: Qi Ju, Thomas Tellier, Meng Sun, Zhemei Fang, Yunfeng Luo
   Abstract: Recent advancements in artificial intelligence (AI) have leveraged large-scale games as benchmarks to gauge progress, with AI now frequently outperforming human capabilities. Traditionally, this success has largely relied on solving for Nash equilibrium (NE) using variations of the counterfactual regret minimization (CFR) method in games with incomplete information. However, the variety of Nash equilibria has been largely overlooked in previous research, limiting the adaptability of AI to meet diverse human preferences. To address this challenge, where AI is powerful but struggles to meet customization needs, we introduce a novel approach: Preference-CFR, which incorporates two new parameters: preference degree and vulnerability degree. These parameters allow for greater flexibility in AI strategy development without compromising convergence. Our method significantly alters the distribution of final strategies, enabling the creation of customized AI models that better align with individual user needs. Using Texas Hold'em as a case study, our experiments demonstrate how Preference-CFR can be adjusted either to emphasize customization, prioritizing user preferences, or to enhance performance, striking a balance between the depth of customization and strategic optimality.
   Submitted 2 November, 2024; originally announced November 2024.
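   For context on the CFR family this abstract extends: at each information set, counterfactual regret minimization derives the next strategy from accumulated positive regrets via regret matching. A minimal sketch of that core update (vanilla regret matching, not the authors' Preference-CFR variant):

       import numpy as np

       def regret_matching(cum_regret):
           # Map accumulated counterfactual regrets at one information set
           # to a strategy: play actions in proportion to positive regret.
           positive = np.maximum(cum_regret, 0.0)
           total = positive.sum()
           if total > 0:
               return positive / total
           # No positive regret yet: fall back to the uniform strategy.
           return np.full(len(cum_regret), 1.0 / len(cum_regret))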
6. arXiv:2410.23079 [pdf, other] (cs.CL; cs.AI)
   BUZZ: Beehive-structured Sparse KV Cache with Segmented Heavy Hitters for Efficient LLM Inference
   Authors: Junqi Zhao, Zhijin Fang, Shu Li, Shaohui Yang, Shichao He
   Abstract: Large language models (LLMs) are essential in natural language processing but often struggle with inference speed and computational efficiency, limiting real-time deployment. The key-value (KV) cache mechanism reduces computational overhead in transformer models, but challenges in maintaining contextual understanding remain. In this paper, we propose BUZZ, a novel KV caching algorithm that leverages structured contextual information to minimize cache memory usage while enhancing inference speed. BUZZ employs a beehive-structured sparse cache, incorporating a sliding window to capture recent information and dynamically segmenting historical tokens into chunks to prioritize important tokens in local neighborhoods. We evaluate BUZZ on four real-world datasets: CNN/Daily Mail, XSUM, Wikitext, and 10-QA.
   Our results demonstrate that BUZZ (1) reduces cache memory usage by 2.5x in LLM inference while maintaining over 99% accuracy in long-text summarization, and (2) surpasses state-of-the-art performance in multi-document question answering by 7.69% under the same memory limit, where full-cache methods encounter out-of-memory issues. Additionally, BUZZ achieves significant inference speedup with log(n) time complexity. The code is available at https://github.com/JunqiZhao888/buzz-llm.
   Submitted 30 October, 2024; originally announced October 2024.
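   The cache policy this abstract outlines, a sliding window over recent tokens plus per-chunk heavy hitters from the older history, might look roughly as follows. All names here are hypothetical and the sketch makes no claim to match the released code:

       import numpy as np

       def select_kv_indices(attn_scores, window=64, chunk=32, keep_per_chunk=2):
           # Keep the last `window` token positions verbatim; segment the
           # older history into chunks and keep only the highest-scoring
           # ("heavy hitter") positions within each chunk.
           n = len(attn_scores)
           recent = list(range(max(0, n - window), n))
           kept = []
           for start in range(0, max(0, n - window), chunk):
               end = min(start + chunk, n - window)
               top = np.argsort(attn_scores[start:end])[-keep_per_chunk:]
               kept.extend(start + int(i) for i in top)
           return sorted(kept) + recent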
7. arXiv:2410.22852 [pdf, other] (cs.IT; eess.SP)
   Centimeter-level Geometry Reconstruction and Material Identification in 300 GHz Monostatic Sensing
   Authors: Zitong Fang, Ziming Yu, Chong Han
   Abstract: Terahertz (THz) integrated sensing and communication (ISAC) technology is envisioned to achieve high communication performance alongside advanced sensing abilities. For various applications of ISAC, accurate environment reconstruction, including geometry reconstruction and material identification, is critical. This paper presents a highly precise geometry reconstruction algorithm and a material identification scheme for a monostatic sensing case in a typical indoor scenario. Experiments are conducted in the frequency range from 290 GHz to 310 GHz using a vector network analyzer (VNA)-based channel sounder, with the transmitter and receiver co-located. A joint delay and angle space-alternating generalized expectation-maximization (SAGE)-based algorithm is implemented to estimate multipath component (MPC) parameters, and the indoor geometry is reconstructed from the extracted parameters. Furthermore, a geometry-based method is employed to model and remove the spurious path of the corner, reaching an accuracy of 1.75 cm. Additionally, a material database using THz time-domain spectroscopy (THz-TDS) is established, capturing the reflection losses of over 200 common material samples. Applying this database to our monostatic sensing, the measured reflection losses of the wall and window frame are accurately identified as cement and steel, respectively. Our results demonstrate centimeter-level geometry reconstruction and accurate material identification for practical THz ISAC scenarios, unleashing sensing potential unprecedented in the microwave and millimeter-wave bands.
   Submitted 30 October, 2024; originally announced October 2024.
8. arXiv:2410.18090 [pdf, other] (cs.IR; cs.AI)
   Liver Cancer Knowledge Graph Construction based on dynamic entity replacement and masking strategies RoBERTa-BiLSTM-CRF model
   Authors: YiChi Zhang, HaiLing Wang, YongBin Gao, XiaoJun Hu, YingFang Fan, ZhiJun Fang
   Abstract: Background: Liver cancer ranks as the fifth most common malignant tumor and the second most fatal in our country. Early diagnosis is crucial, necessitating that physicians identify liver cancer in patients at the earliest possible stage. However, the diagnostic process is complex and demanding. Physicians must analyze a broad spectrum of patient data, encompassing physical condition, symptoms, medical history, and results from various examinations and tests, recorded in both structured and unstructured medical formats. This results in a significant workload for healthcare professionals. In response, integrating knowledge graph technology to develop a liver cancer knowledge graph-assisted diagnosis and treatment system aligns with national efforts toward smart healthcare. Such a system promises to mitigate the challenges faced by physicians in diagnosing and treating liver cancer. Methods: This paper addresses the major challenges in building a knowledge graph for hepatocellular carcinoma diagnosis, such as the discrepancy between public data sources and real electronic medical records, the effective integration of which remains a key issue. The knowledge graph construction process consists of six steps: conceptual layer design, data preprocessing, entity identification, entity normalization, knowledge fusion, and graph visualization. A novel Dynamic Entity Replacement and Masking Strategy (DERM) for named entity recognition is proposed. Results: A knowledge graph for liver cancer was established, including 7 entity types such as disease, symptom, and constitution, and containing 1,495 entities. The recognition accuracy of the model was 93.23%, the recall was 94.69%, and the F1 score was 93.96%.
   Submitted 8 October, 2024; originally announced October 2024.
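   An entity replacement and masking augmentation in the spirit of the DERM strategy named above might be sketched as follows; the paper's exact procedure is not specified here, so every detail below is an assumption (BIO-style labels, a type-indexed lexicon, one token per mention):

       import random

       def replace_or_mask_entities(tokens, labels, lexicon,
                                    p_replace=0.3, p_mask=0.1):
           # tokens/labels: parallel lists with BIO tags, e.g. "B-Symptom".
           # lexicon: entity type -> list of surface forms (hypothetical).
           out_tokens = []
           for tok, lab in zip(tokens, labels):
               r = random.random()
               if lab != "O" and r < p_replace:
                   etype = lab.split("-")[-1]  # "B-Symptom" -> "Symptom"
                   tok = random.choice(lexicon.get(etype, [tok]))
               elif lab != "O" and r < p_replace + p_mask:
                   tok = "[MASK]"  # hide the mention to discourage memorization
               out_tokens.append(tok)
           return out_tokens, labels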
9. arXiv:2410.10601 [pdf, other] (cs.RO)
   Fully Asynchronous Neuromorphic Perception for Mobile Robot Dodging with Loihi Chips
   Authors: Junjie Jiang, Delei Kong, Chenming Hu, Zheng Fang
   Abstract: Sparse and asynchronous sensing and processing in natural organisms lead to ultra-low-latency and energy-efficient perception. Event cameras, known as neuromorphic vision sensors, are designed to mimic these characteristics. However, fully utilizing the sparse and asynchronous event stream remains challenging. Influenced by the mature algorithms of standard cameras, most existing event-based algorithms still rely on the "group of events" processing paradigm (e.g., event frames, 3D voxels) when handling event streams. This paradigm encounters issues such as feature loss, event stacking, and high computational burden, deviating from the intended purpose of event cameras. To address these issues, we propose a fully asynchronous neuromorphic paradigm that integrates event cameras, spiking networks, and neuromorphic processors (Intel Loihi). This paradigm faithfully processes each event asynchronously as it arrives, mimicking the spike-driven signal processing of biological brains. We compare the proposed paradigm with the existing "group of events" processing paradigm in detail on a real mobile robot dodging task. Experimental results show that our scheme exhibits better robustness than frame-based methods under different time windows and light conditions. Additionally, the energy consumption per inference of our scheme on the embedded Loihi processor is only 4.30% of that of the event spike tensor method on an NVIDIA Jetson Orin NX in energy-saving mode, and 1.64% of that of the event frame method on the same neuromorphic processor.
   As far as we know, this is the first time that a fully asynchronous neuromorphic paradigm has been implemented for solving sequential tasks on a real mobile robot.
   Submitted 14 October, 2024; originally announced October 2024.
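   The per-event, spike-driven processing that this abstract contrasts with "group of events" pipelines can be illustrated with a leaky integrate-and-fire neuron updated asynchronously as each event arrives. A minimal sketch, not tied to the Loihi toolchain or the authors' network:

       import math

       class LIFNeuron:
           # A leaky integrate-and-fire neuron driven one event at a time.
           def __init__(self, tau=0.02, threshold=1.0):
               self.tau, self.threshold = tau, threshold
               self.v, self.last_t = 0.0, 0.0

           def on_event(self, t, weight):
               # Decay the membrane potential for the time elapsed since the
               # previous event, integrate this event, and spike on threshold.
               self.v *= math.exp(-(t - self.last_t) / self.tau)
               self.last_t = t
               self.v += weight
               if self.v >= self.threshold:
                   self.v = 0.0  # reset after emitting a spike
                   return True
               return False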
10. arXiv:2410.09409 [pdf, other] (cs.CV)
   Distribution-aware Noisy-label Crack Segmentation
   Authors: Xiaoyan Jiang, Xinlong Wan, Kaiying Zhu, Xihe Qiu, Zhijun Fang
   Abstract: Road crack segmentation is critical for robotic systems tasked with the inspection, maintenance, and monitoring of road infrastructures. Existing deep learning-based methods for crack segmentation are typically trained on specific datasets, which can lead to significant performance degradation when applied to unseen real-world scenarios. To address this, we introduce the SAM-Adapter, which incorporates the general knowledge of the Segment Anything Model (SAM) into crack segmentation, demonstrating enhanced performance and generalization capabilities. However, the effectiveness of the SAM-Adapter is constrained by noisy labels within small-scale training sets, including omissions and mislabeling of cracks. In this paper, we present an innovative joint learning framework that utilizes distribution-aware domain-specific semantic knowledge to guide the discriminative learning process of the SAM-Adapter. To our knowledge, this is the first approach that effectively minimizes the adverse effects of noisy labels on the supervised learning of the SAM-Adapter. Our experimental results on two public pavement crack segmentation datasets confirm that our method significantly outperforms existing state-of-the-art techniques. Furthermore, evaluations on the completely unseen CFD dataset demonstrate the high cross-domain generalization capability of our model, underscoring its potential for practical applications in crack segmentation.
   Submitted 12 October, 2024; originally announced October 2024.

11. arXiv:2410.06795 [pdf, other] (cs.CL; cs.CV)
   From Pixels to Tokens: Revisiting Object Hallucinations in Large Vision-Language Models
   Authors: Yuying Shang, Xinyi Zeng, Yutao Zhu, Xiao Yang, Zhengwei Fang, Jingyuan Zhang, Jiawei Chen, Zinan Liu, Yu Tian
   Abstract: Hallucinations in large vision-language models (LVLMs) are a significant challenge, i.e., generating objects that are not present in the visual input, which impairs their reliability. Recent studies often attribute hallucinations to a lack of understanding of visual input, yet ignore a more fundamental issue: the model's inability to effectively extract or decouple visual features. In this paper, we revisit hallucinations in LVLMs from an architectural perspective, investigating whether the primary cause lies in the visual encoder (feature extraction) or the modal alignment module (feature decoupling). Motivated by the findings of our preliminary investigation, we propose a novel tuning strategy, PATCH, to mitigate hallucinations in LVLMs. This plug-and-play method can be integrated into various LVLMs, utilizing adaptive virtual tokens to extract object features from bounding boxes, thereby addressing hallucinations caused by insufficient decoupling of visual features. PATCH achieves state-of-the-art performance on multiple multi-modal hallucination datasets. We hope this approach provides researchers with deeper insights into the underlying causes of hallucinations in LVLMs, fostering further advancements and innovation in this field.
   Submitted 9 October, 2024; originally announced October 2024.
12. arXiv:2410.04320 [pdf, other] (cs.AI)
   Channel-Aware Throughput Maximization for Cooperative Data Fusion in CAV
   Authors: Haonan An, Zhengru Fang, Yuang Zhang, Senkang Hu, Xianhao Chen, Guowen Xu, Yuguang Fang
   Abstract: Connected and autonomous vehicles (CAVs) have garnered significant attention due to their extended perception range and enhanced sensing coverage. To address challenges such as blind spots and obstructions, CAVs employ vehicle-to-vehicle (V2V) communications to aggregate sensory data from surrounding vehicles. However, cooperative perception is often constrained by the limitations of achievable network throughput and channel quality. In this paper, we propose a channel-aware throughput maximization approach to facilitate CAV data fusion, leveraging a self-supervised autoencoder for adaptive data compression. We formulate the problem as a mixed integer programming (MIP) model, which we decompose into two sub-problems to derive optimal data rate and compression ratio solutions under given link conditions. An autoencoder is then trained to minimize bitrate with the determined compression ratio, and a fine-tuning strategy is employed to further reduce spectrum resource consumption. Experimental evaluation on the OpenCOOD platform demonstrates the effectiveness of our proposed algorithm, showing more than a 20.19% improvement in network throughput and a 9.38% increase in average precision (AP@IoU) compared to state-of-the-art methods, with an optimal latency of 19.99 ms.
   Submitted 5 October, 2024; originally announced October 2024.
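   The two-subproblem structure this abstract describes, choosing a data rate and a compression ratio under a given link condition, can be illustrated with a toy brute-force selector over discrete candidates. The utility function and candidate sets are assumptions, and this stands in for, rather than reproduces, the paper's MIP decomposition:

       def best_rate_and_compression(capacity_mbps, rates, ratios, utility):
           # Enumerate (data rate, compression ratio) pairs whose compressed
           # traffic fits the link, and keep the pair with the best task
           # utility (e.g., expected detection precision); all inputs assumed.
           best, best_u = None, float("-inf")
           for r in rates:
               for c in ratios:
                   if r / c <= capacity_mbps:  # compressed traffic must fit
                       u = utility(r, c)
                       if u > best_u:
                           best, best_u = (r, c), u
           return best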
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.04320v1-abstract-full').style.display = 'none'; document.getElementById('2410.04320v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.04168">arXiv:2410.04168</a> <span> [<a href="https://arxiv.org/pdf/2410.04168">pdf</a>, <a href="https://arxiv.org/format/2410.04168">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> </div> </div> <p class="title is-5 mathjax"> R-ACP: Real-Time Adaptive Collaborative Perception Leveraging Robust Task-Oriented Communications </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Fang%2C+Z">Zhengru Fang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jingjing Wang</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+Y">Yanan Ma</a>, <a href="/search/cs?searchtype=author&query=Tao%2C+Y">Yihang Tao</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+Y">Yiqin Deng</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xianhao Chen</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+Y">Yuguang Fang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.04168v3-abstract-short" style="display: inline;"> Collaborative perception enhances sensing in multi-robot and vehicular networks by fusing information from multiple agents, improving perception accuracy and sensing range. However, mobility and non-rigid sensor mounts introduce extrinsic calibration errors, necessitating online calibration, further complicated by limited overlap in sensing regions. Moreover, maintaining fresh information is cruci… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.04168v3-abstract-full').style.display = 'inline'; document.getElementById('2410.04168v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.04168v3-abstract-full" style="display: none;"> Collaborative perception enhances sensing in multi-robot and vehicular networks by fusing information from multiple agents, improving perception accuracy and sensing range. However, mobility and non-rigid sensor mounts introduce extrinsic calibration errors, necessitating online calibration, further complicated by limited overlap in sensing regions. Moreover, maintaining fresh information is crucial for timely and accurate sensing. To address calibration errors and ensure timely and accurate perception, we propose a robust task-oriented communication strategy to optimize online self-calibration and efficient feature sharing for Real-time Adaptive Collaborative Perception (R-ACP). Specifically, we first formulate an Age of Perceived Targets (AoPT) minimization problem to capture data timeliness of multi-view streaming. 
Then, in the calibration phase, we introduce a channel-aware self-calibration technique based on re-identification (Re-ID), which adaptively compresses key features according to channel capacities, effectively addressing calibration issues via spatial and temporal cross-camera correlations. In the streaming phase, we tackle the trade-off between bandwidth and inference accuracy by leveraging an Information Bottleneck (IB) based encoding method to adjust video compression rates based on task relevance, thereby reducing communication overhead and latency. Finally, we design a priority-aware network to filter corrupted features to mitigate performance degradation from packet corruption. Extensive studies demonstrate that our framework outperforms five baselines, improving multiple object detection accuracy (MODA) by 25.49% and reducing communication costs by 51.36% under severely poor channel conditions.
Submitted 24 October, 2024; v1 submitted 5 October, 2024; originally announced October 2024.
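One concrete reading of "adaptively compresses key features according to channel capacities" is a top-k selection over feature dimensions ranked by task relevance. The sketch below is only an illustration of that idea; the ranking rule and bit accounting are assumptions.

```python
# Illustrative channel-aware feature compression: keep only as many feature
# dimensions as the current channel capacity allows, ranked by importance.
import torch

def compress_features(feat, importance, capacity_bits, bits_per_dim=16):
    # feat: (D,) feature vector; importance: (D,) per-dimension scores.
    k = max(1, min(feat.numel(), capacity_bits // bits_per_dim))
    idx = torch.topk(importance, k).indices   # most task-relevant dimensions
    return idx, feat[idx]                     # transmit indices plus values

feat = torch.randn(512)
importance = feat.abs()                       # stand-in relevance score
idx, payload = compress_features(feat, importance, capacity_bits=2048)
print(payload.numel(), "dims sent")           # 128 dims at 16 bits each
```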
arXiv:2410.02592 (https://arxiv.org/abs/2410.02592) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence); cs.LG (Machine Learning); eess.SY (Systems and Control)
Title: IC3M: In-Car Multimodal Multi-object Monitoring for Abnormal Status of Both Driver and Passengers
Authors: Zihan Fang, Zheng Lin, Senkang Hu, Hangcheng Cao, Yiqin Deng, Xianhao Chen, Yuguang Fang
Abstract: Recently, in-car monitoring has emerged as a promising technology for detecting early-stage abnormal status of the driver and providing timely alerts to prevent traffic accidents. Although training models with multimodal data enhances the reliability of abnormal status detection, the scarcity of labeled data and the imbalance of class distribution impede the extraction of critical abnormal state features, significantly deteriorating training performance. Furthermore, missing modalities due to environment and hardware limitations further exacerbate the challenge of abnormal status identification. More importantly, monitoring abnormal health conditions of passengers, particularly in elderly care, is of paramount importance but remains underexplored. To address these challenges, we introduce IC3M, an efficient camera-rotation-based multimodal framework for monitoring both driver and passengers in a car. IC3M comprises two key modules: an adaptive threshold pseudo-labeling strategy and a missing modality reconstruction module. The former customizes pseudo-labeling thresholds for different classes based on the class distribution, generating class-balanced pseudo labels to guide model training effectively, while the latter leverages cross-modality relationships learned from limited labels to accurately recover missing modalities by transferring distributions from available modalities. Extensive experimental results demonstrate that IC3M outperforms state-of-the-art benchmarks in accuracy, precision, and recall, while exhibiting superior robustness under limited labeled data and severe missing modality.
Submitted 21 November, 2024; v1 submitted 3 October, 2024; originally announced October 2024.
Comments: 16 pages, 17 figures
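Class-adaptive pseudo-labeling of the kind described here is easy to make concrete: rarer classes get lower confidence thresholds so the accepted pseudo labels stay class-balanced. The exact threshold rule below is an assumption for illustration, not IC3M's formula.

```python
# Minimal sketch of class-adaptive pseudo-labeling: thresholds scale down
# for under-represented classes, keeping pseudo labels class-balanced.
import torch

def adaptive_thresholds(class_counts, base=0.95, floor=0.6):
    freq = class_counts / class_counts.sum()
    # Lower the base threshold for rare classes (square-root damping, assumed).
    return torch.clamp(base * (freq / freq.max()) ** 0.5, min=floor)

def pseudo_label(probs, thresholds):
    conf, cls = probs.max(dim=1)
    keep = conf >= thresholds[cls]        # per-class acceptance test
    return cls[keep], keep

counts = torch.tensor([900.0, 80.0, 20.0])       # imbalanced class distribution
th = adaptive_thresholds(counts)                  # e.g. tensor([0.95, 0.60, 0.60])
probs = torch.softmax(torch.randn(16, 3), dim=1)
labels, mask = pseudo_label(probs, th)
print(th, labels)
```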
arXiv:2409.15259 (https://arxiv.org/abs/2409.15259) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence)
Title: S$^2$AG-Vid: Enhancing Multi-Motion Alignment in Video Diffusion Models via Spatial and Syntactic Attention-Based Guidance
Authors: Yuanhang Li, Qi Mao, Lan Chen, Zhen Fang, Lei Tian, Xinyan Xiao, Libiao Jin, Hua Wu
Abstract: Recent advancements in text-to-video (T2V) generation using diffusion models have garnered significant attention. However, existing T2V models primarily focus on simple scenes featuring a single object performing a single motion. Challenges arise in scenarios involving multiple objects with distinct motions, often leading to incorrect video-text alignment between subjects and their corresponding motions. To address this challenge, we propose S$^2$AG-Vid, a training-free inference-stage optimization method that improves the alignment of multiple objects with their corresponding motions in T2V models. S$^2$AG-Vid initially applies a spatial position-based cross-attention (CA) constraint in the early stages of the denoising process, facilitating multiple nouns distinctly attending to the correct subject regions.
To enhance motion-subject binding, we implement a syntax-guided contrastive constraint in the subsequent denoising phase, aimed at improving the correlations between the CA maps of verbs and their corresponding nouns. Both qualitative and quantitative evaluations demonstrate that the proposed framework significantly outperforms baseline approaches, producing higher-quality videos with improved subject-motion consistency.
Submitted 23 September, 2024; originally announced September 2024.
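A spatial cross-attention constraint of the kind the abstract mentions can be written as a loss that pushes each noun token's attention mass into its subject's region. The shapes and the loss form below are assumptions for illustration only.

```python
# Sketch of a spatial cross-attention constraint: penalize attention mass
# that each noun token places outside its assigned subject region.
import torch

def spatial_ca_loss(attn, masks):
    # attn: (T, H*W) cross-attention maps for T noun tokens (rows sum to 1).
    # masks: (T, H*W) binary region masks, one per noun.
    inside = (attn * masks).sum(dim=1)   # attention mass inside each region
    return (1.0 - inside).mean()         # minimize mass leaking outside

attn = torch.softmax(torch.randn(2, 64), dim=1)
masks = torch.zeros(2, 64)
masks[0, :32] = 1.0                      # noun 0 owns the left half
masks[1, 32:] = 1.0                      # noun 1 owns the right half
print(spatial_ca_loss(attn, masks))
```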
arXiv:2409.13503 (https://arxiv.org/abs/2409.13503) [pdf, other]
Subjects: cs.DC (Distributed, Parallel, and Cluster Computing); cs.AI (Artificial Intelligence); cs.LG (Machine Learning)
Title: SatFed: A Resource-Efficient LEO Satellite-Assisted Heterogeneous Federated Learning Framework
Authors: Yuxin Zhang, Zheng Lin, Zhe Chen, Zihan Fang, Wenjun Zhu, Xianhao Chen, Jin Zhao, Yue Gao
Abstract: Traditional federated learning (FL) frameworks rely heavily on terrestrial networks, where coverage limitations and increasing bandwidth congestion significantly hinder model convergence. Fortunately, the advancement of low-Earth orbit (LEO) satellite networks offers promising new communication avenues to augment traditional terrestrial FL. Despite this potential, the limited satellite-ground communication bandwidth and the heterogeneous operating environments of ground devices (including variations in data, bandwidth, and computing power) pose substantial challenges for effective and robust satellite-assisted FL. To address these challenges, we propose SatFed, a resource-efficient satellite-assisted heterogeneous FL framework. SatFed implements freshness-based model prioritization queues to optimize the use of highly constrained satellite-ground bandwidth, ensuring the transmission of the most critical models. Additionally, a multigraph is constructed to capture real-time heterogeneous relationships between devices, including data distribution, terrestrial bandwidth, and computing capability. This multigraph enables SatFed to aggregate satellite-transmitted models into peer guidance, enhancing local training in heterogeneous environments. Extensive experiments with real-world LEO satellite networks demonstrate that SatFed achieves superior performance and robustness compared to state-of-the-art benchmarks.
Submitted 21 November, 2024; v1 submitted 20 September, 2024; originally announced September 2024.
Comments: 10 pages, 12 figures
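A freshness-based prioritization queue, as named in the abstract, can be sketched with a standard heap: fresher client models are dequeued first for the constrained uplink. The staleness metric (training round) is an assumed stand-in for whatever freshness measure SatFed actually uses.

```python
# Toy freshness-priority queue: models from more recent training rounds
# are transmitted first over the constrained satellite-ground link.
import heapq
import time

class FreshnessQueue:
    def __init__(self):
        self._heap = []

    def push(self, client_id, round_trained):
        # Higher (more recent) round => smaller key => higher priority.
        heapq.heappush(self._heap, (-round_trained, time.time(), client_id))

    def pop(self):
        return heapq.heappop(self._heap)[2]

q = FreshnessQueue()
q.push("client-a", round_trained=3)
q.push("client-b", round_trained=7)
print(q.pop())  # client-b: the freshest model goes up first
```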
arXiv:2409.12190 (https://arxiv.org/abs/2409.12190) [pdf, other]
Subjects: cs.RO (Robotics); cs.CV (Computer Vision and Pattern Recognition)
Title: Bundle Adjustment in the Eager Mode
Authors: Zitong Zhan, Huan Xu, Zihang Fang, Xinpeng Wei, Yaoyu Hu, Chen Wang
Abstract: Bundle adjustment (BA) is a critical technique in various robotic applications, such as simultaneous localization and mapping (SLAM), augmented reality (AR), and photogrammetry. BA optimizes parameters such as camera poses and 3D landmarks to align them with observations. With the growing importance of deep learning in perception systems, there is an increasing need to integrate BA with deep learning frameworks for enhanced reliability and performance. However, widely used C++-based BA frameworks, such as GTSAM, g$^2$o, and Ceres, lack native integration with modern deep learning libraries like PyTorch. This limitation affects their flexibility, adaptability, ease of debugging, and overall implementation efficiency. To address this gap, we introduce an eager-mode BA framework seamlessly integrated with PyPose, providing PyTorch-compatible interfaces with high efficiency. Our approach includes GPU-accelerated, differentiable, and sparse operations designed for second-order optimization, Lie group and Lie algebra operations, and linear solvers. Our eager-mode BA on GPU demonstrates substantial runtime efficiency, achieving an average speedup of 18.5$\times$, 22$\times$, and 23$\times$ compared to GTSAM, g$^2$o, and Ceres, respectively.
Submitted 18 September, 2024; originally announced September 2024.
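To make "eager-mode BA" concrete without guessing at PyPose's actual API, here is a deliberately tiny plain-PyTorch toy: refine a 3D point and a camera translation by gradient descent on reprojection error. The paper uses second-order solvers and Lie-group operations; this first-order example only illustrates why eager differentiability is convenient. The observation and camera model are made up.

```python
# Tiny eager-mode "bundle adjustment" toy: jointly refine a 3D point and a
# camera translation by minimizing reprojection error with autograd.
import torch

def project(X, t, f=500.0):
    # Pinhole projection of world point X after translating by camera t.
    Xc = X + t
    return f * Xc[:2] / Xc[2]

obs = torch.tensor([12.0, -8.0])                   # observed pixel (assumed)
X = torch.tensor([0.1, -0.1, 5.0], requires_grad=True)
t = torch.zeros(3, requires_grad=True)
opt = torch.optim.Adam([X, t], lr=1e-2)
for _ in range(200):
    opt.zero_grad()
    loss = (project(X, t) - obs).pow(2).sum()      # reprojection error
    loss.backward()
    opt.step()
print(loss.item())                                  # should be near zero
```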
arXiv:2409.09371 (https://arxiv.org/abs/2409.09371) [pdf, other]
Subjects: physics.ao-ph (Atmospheric and Oceanic Physics); cs.LG (Machine Learning)
Title: WeatherReal: A Benchmark Based on In-Situ Observations for Evaluating Weather Models
Authors: Weixin Jin, Jonathan Weyn, Pengcheng Zhao, Siqi Xiang, Jiang Bian, Zuliang Fang, Haiyu Dong, Hongyu Sun, Kit Thambiratnam, Qi Zhang
Abstract: In recent years, AI-based weather forecasting models have matched or even outperformed numerical weather prediction systems. However, most of these models have been trained and evaluated on reanalysis datasets like ERA5. These datasets, being products of numerical models, often diverge substantially from actual observations in some crucial variables like near-surface temperature, wind, precipitation, and clouds, parameters that hold significant public interest. To address this divergence, we introduce WeatherReal, a novel benchmark dataset for weather forecasting derived from global near-surface in-situ observations. WeatherReal also features a publicly accessible quality control and evaluation framework. This paper details the sources and processing methodologies underlying the dataset, and further illustrates the advantage of in-situ observations in capturing hyper-local and extreme weather through comparative analyses and case studies. Using WeatherReal, we evaluated several data-driven models and compared them with leading numerical models.
Our work aims to advance AI-based weather forecasting research toward a more application-focused and operation-ready approach.
Submitted 14 September, 2024; originally announced September 2024.
arXiv:2409.09293 (https://arxiv.org/abs/2409.09293) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: Associate Everything Detected: Facilitating Tracking-by-Detection to the Unknown
Authors: Zimeng Fang, Chao Liang, Xue Zhou, Shuyuan Zhu, Xi Li
Abstract: Multi-object tracking (MOT) emerges as a pivotal and highly promising branch in the field of computer vision. Classical closed-vocabulary MOT (CV-MOT) methods aim to track objects of predefined categories. Recently, some open-vocabulary MOT (OV-MOT) methods have successfully addressed the problem of tracking unknown categories. However, we found that the CV-MOT and OV-MOT methods each struggle to excel in the tasks of the other. In this paper, we present a unified framework, Associate Everything Detected (AED), that simultaneously tackles CV-MOT and OV-MOT by integrating with any off-the-shelf detector and supports unknown categories. Different from existing tracking-by-detection MOT methods, AED gets rid of prior knowledge (e.g., motion cues) and relies solely on highly robust feature learning to handle complex trajectories in OV-MOT tasks while keeping excellent performance in CV-MOT tasks. Specifically, we model the association task as a similarity decoding problem and propose a sim-decoder with an association-centric learning mechanism. The sim-decoder calculates similarities in three aspects: spatial, temporal, and cross-clip. Subsequently, association-centric learning leverages these threefold similarities to ensure that the extracted features are appropriate for continuous tracking and robust enough to generalize to unknown categories. Compared with existing powerful OV-MOT and CV-MOT methods, AED achieves superior performance on TAO, SportsMOT, and DanceTrack without any prior knowledge. Our code is available at https://github.com/balabooooo/AED.
Submitted 13 September, 2024; originally announced September 2024.
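The core idea of association purely by embedding similarity, with no motion prior, can be sketched as below. Folding the paper's three similarity terms (spatial, temporal, cross-clip) into one cosine score, and the greedy per-track matching, are simplifying assumptions.

```python
# Sketch of similarity-only track/detection association: greedily match each
# track to its most similar detection embedding, no motion model involved.
import torch
import torch.nn.functional as F

def associate(track_emb, det_emb, thresh=0.5):
    # track_emb: (M, D), det_emb: (N, D) appearance embeddings.
    sim = F.normalize(track_emb, dim=1) @ F.normalize(det_emb, dim=1).T
    best = sim.argmax(dim=1)                      # greedy best match per track
    matched = sim[torch.arange(sim.size(0)), best] > thresh
    return [(int(m), int(d)) for m, d in zip(torch.where(matched)[0], best[matched])]

tracks = torch.randn(3, 128)
dets = torch.cat([tracks[1:2] + 0.05 * torch.randn(1, 128), torch.randn(2, 128)])
print(associate(tracks, dets))  # track 1 should match detection 0
```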
arXiv:2409.08840 (https://arxiv.org/abs/2409.08840) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: Direct-CP: Directed Collaborative Perception for Connected and Autonomous Vehicles via Proactive Attention
Authors: Yihang Tao, Senkang Hu, Zhengru Fang, Yuguang Fang
Abstract: Collaborative perception (CP) leverages visual data from connected and autonomous vehicles (CAVs) to enhance an ego vehicle's field of view (FoV). Despite recent progress, current CP methods expand the ego vehicle's 360-degree perceptual range almost equally, which faces two key challenges. Firstly, in areas with uneven traffic distribution, focusing on directions with little traffic offers limited benefits. Secondly, under limited communication budgets, allocating excessive bandwidth to less critical directions lowers the perception accuracy in more vital areas. To address these issues, we propose Direct-CP, a proactive and direction-aware CP system aimed at improving CP in specific directions. Our key idea is to enable an ego vehicle to proactively signal its interested directions and readjust its attention to enhance local directional CP performance. To achieve this, we first propose an RSU-aided direction masking mechanism that assists an ego vehicle in identifying vital directions. Additionally, we design a direction-aware selective attention module to wisely aggregate pertinent features based on the ego vehicle's directional priorities, communication budget, and the positional data of CAVs. Moreover, we introduce a direction-weighted detection loss (DWLoss) to capture the divergence between directional CP outcomes and the ground truth, facilitating effective model training. Extensive experiments on the V2X-Sim 2.0 dataset demonstrate that our approach achieves 19.8% higher local perception accuracy in interested directions and 2.5% higher overall perception accuracy than the state-of-the-art methods in collaborative 3D object detection tasks.
Submitted 13 September, 2024; originally announced September 2024.
Comments: 7 pages
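The direction-masking idea can be reduced to a toy: mark the sectors the ego vehicle cares about and split the communication budget across only those sectors. The 8-sector layout and even split are illustrative assumptions, not Direct-CP's actual allocation policy.

```python
# Toy direction-aware bandwidth allocation: concentrate the budget on the
# ego vehicle's interested sectors and spend nothing elsewhere.
import torch

def directional_mask(interest, n_sectors=8):
    # interest: indices of the sectors the ego vehicle cares about.
    mask = torch.zeros(n_sectors)
    mask[interest] = 1.0
    return mask

def allocate_bandwidth(mask, budget_mbps=40.0):
    # Split the communication budget evenly across interested sectors.
    return budget_mbps * mask / mask.sum()

mask = directional_mask(interest=[0, 1, 7])   # mostly ahead of the vehicle
print(allocate_bandwidth(mask))               # ~13.3 Mb/s per chosen sector
```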
arXiv:2409.06206 (https://arxiv.org/abs/2409.06206) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: AgileIR: Memory-Efficient Group Shifted Windows Attention for Agile Image Restoration
Authors: Hongyi Cai, Mohammad Mahdinur Rahman, Mohammad Shahid Akhtar, Jie Li, Jingyu Wu, Zhili Fang
Abstract: Image Transformers have shown remarkable success in image restoration tasks. Nevertheless, most transformer-based models are strictly bounded by exorbitant memory occupancy. Our goal is to reduce the memory consumption of the Swin Transformer while speeding up the model during training. Thus, we introduce AgileIR, a group shifted attention mechanism combined with window attention, which sparsely simplifies the model architecture. We propose Group Shifted Window Attention (GSWA) to decompose Shifted Window Multi-head Self Attention (SW-MSA) and Window Multi-head Self Attention (W-MSA) into groups across their attention heads, shrinking memory usage during back-propagation. In addition, we keep shifted window masking and its shifted learnable biases during training to encourage the model to interact across windows within the channel. We also re-allocate projection parameters to accelerate attention matrix computation, with only a negligible decrease in performance. In our experiments, compared with our baseline SwinIR and other efficient quantization models, AgileIR maintains 32.20 dB on the Set5 evaluation dataset, exceeding other tailor-made efficient methods, and saves over 50% memory when a large batch size is employed.
Submitted 10 September, 2024; originally announced September 2024.
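One way to read "decompose attention into groups across heads" is to materialize attention maps for only one group of heads at a time, cutting peak activation memory. The sketch below illustrates that reading under assumed shapes; it is not AgileIR's implementation.

```python
# Rough sketch of grouped window attention: process attention heads group by
# group so fewer attention maps are alive at once during backprop.
import torch
import torch.nn.functional as F

def grouped_window_attention(q, k, v, n_groups=2):
    # q, k, v: (heads, tokens, dim_per_head) for one window.
    heads = q.size(0)
    outs = []
    for g in torch.chunk(torch.arange(heads), n_groups):
        attn = F.softmax(q[g] @ k[g].transpose(-2, -1) / q.size(-1) ** 0.5, dim=-1)
        outs.append(attn @ v[g])    # only this group's maps are materialized
    return torch.cat(outs, dim=0)

q = k = v = torch.randn(8, 49, 32)  # 8 heads over a 7x7 window
print(grouped_window_attention(q, k, v).shape)  # torch.Size([8, 49, 32])
```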
arXiv:2409.04133 (https://arxiv.org/abs/2409.04133) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.CY (Computers and Society)
Title: Secure Traffic Sign Recognition: An Attention-Enabled Universal Image Inpainting Mechanism against Light Patch Attacks
Authors: Hangcheng Cao, Longzhi Yuan, Guowen Xu, Ziyang He, Zhengru Fang, Yuguang Fang
Abstract: Traffic sign recognition systems play a crucial role in assisting drivers to make informed decisions while driving. However, due to the heavy reliance on deep learning technologies, particularly for future connected and autonomous driving, these systems are susceptible to adversarial attacks that pose significant safety risks to both personal and public transportation. Notably, researchers recently identified a new attack vector to deceive sign recognition systems: projecting well-designed adversarial light patches onto traffic signs. In comparison with traditional adversarial stickers or graffiti, these emerging light patches exhibit heightened aggression due to their ease of implementation and outstanding stealthiness. To effectively counter this security threat, we propose a universal image inpainting mechanism, SafeSign. It relies on attention-enabled multi-view image fusion to repair traffic signs contaminated by adversarial light patches, thereby ensuring accurate sign recognition. We initially explore the fundamental impact of malicious light patches on the local and global feature spaces of authentic traffic signs. Then, we design a binary mask-based U-Net image generation pipeline that outputs diverse contaminated sign patterns, providing our image inpainting model with the needed training data.
Following this, we develop an attention-mechanism-enabled neural network to jointly utilize the complementary information from multi-view images to repair contaminated signs. Finally, extensive experiments are conducted to evaluate SafeSign's effectiveness in resisting potential light patch-based attacks, bringing an average accuracy improvement of 54.8% across three widely used sign recognition models.
Submitted 6 September, 2024; originally announced September 2024.
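The mask-based training-data generation step can be pictured with a toy routine that pastes a bright rectangular "light patch" into a clean sign image, yielding contaminated/clean pairs plus the binary mask. Patch shape, placement, and intensity here are assumptions; SafeSign's generator is a U-Net pipeline, not this heuristic.

```python
# Toy contaminated-sample generator: overwrite a random rectangle of a clean
# sign image with a bright patch and return the image together with its mask.
import random
import torch

def contaminate(sign, patch_value=1.0):
    # sign: (3, H, W) image in [0, 1]; returns (contaminated, mask).
    _, h, w = sign.shape
    mask = torch.zeros(1, h, w)
    y, x = random.randint(0, h // 2), random.randint(0, w // 2)
    mask[:, y:y + h // 3, x:x + w // 3] = 1.0     # random rectangular patch
    out = sign * (1 - mask) + patch_value * mask  # simulate the light patch
    return out, mask

sign = torch.rand(3, 64, 64)
dirty, mask = contaminate(sign)
print(dirty.shape, int(mask.sum()))  # patch area in pixels
```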
arXiv:2409.03301 (https://arxiv.org/abs/2409.03301) [pdf, other]
Subjects: cs.LG (Machine Learning)
Title: ELO-Rated Sequence Rewards: Advancing Reinforcement Learning Models
Authors: Qi Ju, Falin Hei, Zhemei Fang, Yunfeng Luo
Abstract: Reinforcement Learning (RL) is highly dependent on the meticulous design of the reward function. However, accurately assigning rewards to each state-action pair in Long-Term RL (LTRL) challenges is formidable. Consequently, RL agents are predominantly trained with expert guidance. Drawing on the principles of ordinal utility theory from economics, we propose a novel reward estimation algorithm: ELO-Rating based RL (ERRL). This approach is distinguished by two main features. Firstly, it leverages expert preferences over trajectories instead of cardinal rewards (utilities) to compute the ELO rating of each trajectory as its reward. Secondly, a new reward redistribution algorithm is introduced to mitigate training volatility in the absence of a fixed anchor reward. Our method demonstrates superior performance over several leading baselines in long-term scenarios (extending up to 5000 steps), where conventional RL algorithms falter. Furthermore, we conduct a thorough analysis of how expert preferences affect the outcomes.
Submitted 5 September, 2024; originally announced September 2024.
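The ELO mechanism itself is standard and easy to show: a pairwise expert preference updates each trajectory's rating, and the rating then serves as the trajectory's scalar reward. The K-factor and initial rating below are conventional choices, not values from the paper.

```python
# Minimal ELO-rating sketch: expert preferences over trajectory pairs update
# each trajectory's rating, which ERRL-style methods use as its reward.
def expected(r_a, r_b):
    # Standard ELO expected score of A against B.
    return 1.0 / (1.0 + 10 ** ((r_b - r_a) / 400.0))

def elo_update(r_a, r_b, a_wins, k=32.0):
    e = expected(r_a, r_b)
    s = 1.0 if a_wins else 0.0
    return r_a + k * (s - e), r_b + k * (e - s)

ratings = {"traj1": 1500.0, "traj2": 1500.0}
# The expert prefers traj2 over traj1:
ratings["traj1"], ratings["traj2"] = elo_update(
    ratings["traj1"], ratings["traj2"], a_wins=False)
print(ratings)  # traj2's rating (its reward signal) rises to 1516
```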
arXiv:2409.02706 (https://arxiv.org/abs/2409.02706) [pdf, other]
Subjects: cs.GT (Computer Science and Game Theory)
Title: Beyond Nash Equilibrium: Achieving Bayesian Perfect Equilibrium with Belief Update Fictitious Play
Authors: Qi Ju, Zhemei Fang, Yunfeng Luo
Abstract: In the domain of machine learning and game theory, the quest for Nash Equilibrium (NE) in extensive-form games with incomplete information is challenging yet crucial for enhancing AI's decision-making support under varied scenarios. Traditional Counterfactual Regret Minimization (CFR) techniques excel in navigating towards NE, focusing on scenarios where opponents deploy optimal strategies. However, the essence of machine learning in strategic game play extends beyond reacting to optimal moves; it encompasses aiding human decision-making in all circumstances. This includes not only crafting responses to optimal strategies but also recovering from suboptimal decisions and capitalizing on opponents' errors. Herein lies the significance of transitioning from NE to Bayesian Perfect Equilibrium (BPE), which accounts for every possible condition, including the irrationality of opponents. To bridge this gap, we propose Belief Update Fictitious Play (BUFP), which innovatively blends fictitious play with belief updating to target BPE, a more comprehensive equilibrium concept than NE. Specifically, by adjusting iteration step sizes, BUFP allows for strategic convergence to both NE and BPE. For instance, in our experiments, BUFP(EF) leverages the step size of Extensive Form Fictitious Play (EFFP) to achieve BPE, outperforming traditional CFR by securing a 48.53% increase in benefits in scenarios characterized by dominated strategies.
Submitted 4 September, 2024; originally announced September 2024.
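The base dynamic BUFP builds on, plain fictitious play, fits in a few lines: maintain empirical counts of the opponent's actions as a belief and best-respond to it each round. The payoff matrix and the random stand-in opponent below are made up for illustration.

```python
# Tiny fictitious-play sketch for a 2x2 matrix game: best-respond to the
# empirical belief over opponent actions, updating the belief each round.
import numpy as np

payoff = np.array([[3.0, 0.0],      # row player's payoff matrix (assumed)
                   [1.0, 2.0]])
opp_counts = np.ones(2)             # belief: empirical opponent action counts
rng = np.random.default_rng(0)
for step in range(100):
    belief = opp_counts / opp_counts.sum()
    br = int(np.argmax(payoff @ belief))   # best response to current belief
    opp_action = rng.integers(2)           # stand-in opponent behavior
    opp_counts[opp_action] += 1            # belief update from observation
print("belief:", opp_counts / opp_counts.sum(), "last best response:", br)
```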
arXiv:2409.00726 (https://arxiv.org/abs/2409.00726) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence)
Title: LPUWF-LDM: Enhanced Latent Diffusion Model for Precise Late-phase UWF-FA Generation on Limited Dataset
Authors: Zhaojie Fang, Xiao Yu, Guanyu Zhou, Ke Zhuang, Yifei Chen, Ruiquan Ge, Changmiao Wang, Gangyong Jia, Qing Wu, Juan Ye, Maimaiti Nuliqiman, Peifang Xu, Ahmed Elazab
Abstract: Ultra-Wide-Field Fluorescein Angiography (UWF-FA) enables precise identification of ocular diseases using sodium fluorescein, which can be potentially harmful. Existing research has developed methods to generate UWF-FA from Ultra-Wide-Field Scanning Laser Ophthalmoscopy (UWF-SLO) to reduce the adverse reactions associated with injections. However, these methods have been less effective in producing high-quality late-phase UWF-FA, particularly in lesion areas and fine details. Two primary challenges hinder the generation of high-quality late-phase UWF-FA: the scarcity of paired UWF-SLO and early/late-phase UWF-FA datasets, and the need for realistic generation at lesion sites and potential blood leakage regions. This study introduces an improved latent diffusion model framework to generate high-quality late-phase UWF-FA from limited paired UWF images. To address the aforementioned challenges, our approach employs a module utilizing a Cross-temporal Regional Difference Loss, which encourages the model to focus on the differences between early and late phases. Additionally, we introduce a low-frequency enhanced noise strategy in the diffusion forward process to improve the realism of medical images. To further enhance the mapping capability of the variational autoencoder module, especially with limited datasets, we implement a Gated Convolutional Encoder to extract additional information from conditional images. Our Latent Diffusion Model for Ultra-Wide-Field Late-Phase Fluorescein Angiography (LPUWF-LDM) effectively reconstructs fine details in late-phase UWF-FA and achieves state-of-the-art results compared to other existing methods when working with limited datasets. Our source code is available at: https://github.com/Tinysqua/****.
Submitted 1 September, 2024; originally announced September 2024.
Comments: 13 pages, 7 figures
arXiv:2409.00146 (https://arxiv.org/abs/2409.00146) [pdf, other]
Subjects: cs.NI (Networking and Internet Architecture)
Title: Prioritized Information Bottleneck Theoretic Framework with Distributed Online Learning for Edge Video Analytics
Authors: Zhengru Fang, Senkang Hu, Jingjing Wang, Yiqin Deng, Xianhao Chen, Yuguang Fang
Abstract: Collaborative perception systems leverage multiple edge devices, such as surveillance cameras or autonomous cars, to enhance sensing quality and eliminate blind spots. Despite their advantages, challenges such as limited channel capacity and data redundancy impede their effectiveness. To address these issues, we introduce the Prioritized Information Bottleneck (PIB) framework for edge video analytics. This framework prioritizes the shared data based on the signal-to-noise ratio (SNR) and camera coverage of the region of interest (RoI), reducing spatial-temporal data redundancy to transmit only essential information. This strategy avoids the need for video reconstruction at edge servers and maintains low latency. It leverages a deterministic information bottleneck method to extract compact, relevant features, balancing informativeness and communication costs. For high-dimensional data, we apply variational approximations for practical optimization. To reduce communication costs under fluctuating connections, we propose a gate mechanism based on distributed online learning (DOL) to filter out less informative messages and efficiently select edge servers. Moreover, we establish the asymptotic optimality of DOL by proving the sublinearity of its regret.
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.17047">arXiv:2408.17047</a> <span> [<a href="https://arxiv.org/pdf/2408.17047">pdf</a>, <a href="https://arxiv.org/format/2408.17047">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> </div> </div> <p class="title is-5 mathjax"> PIB: Prioritized Information Bottleneck Framework for Collaborative Edge Video Analytics </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Fang%2C+Z">Zhengru Fang</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+S">Senkang Hu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+L">Liyan Yang</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+Y">Yiqin Deng</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xianhao Chen</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+Y">Yuguang Fang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2408.17047v1-abstract-full" style="display: inline;"> Collaborative edge sensing systems, particularly in collaborative perception systems in autonomous driving, can significantly enhance tracking accuracy and reduce blind spots with multi-view sensing capabilities. However, their limited channel capacity and the redundancy in sensory data pose significant challenges, affecting the performance of collaborative inference tasks. To tackle these issues, we introduce a Prioritized Information Bottleneck (PIB) framework for collaborative edge video analytics. We first propose a priority-based inference mechanism that jointly considers the signal-to-noise ratio (SNR) and the camera's coverage area of the region of interest (RoI). To enable efficient inference, PIB reduces video redundancy in both spatial and temporal domains and transmits only the essential information for the downstream inference tasks. This eliminates the need to reconstruct videos on the edge server while maintaining low latency. Specifically, it derives compact, task-relevant features by employing the deterministic information bottleneck (IB) method, which strikes a balance between feature informativeness and communication costs. Given the computational challenges caused by IB-based objectives with high-dimensional data, we resort to variational approximations for feasible optimization. Compared to TOCOM-TEM, JPEG, and HEVC, PIB achieves an improvement of up to 15.1% in mean object detection accuracy (MODA) and reduces communication costs by 66.7% when edge cameras experience poor channel conditions. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by Globecom 2024. Code will be available at https://github.com/fangzr/PIB-Prioritized-Information-Bottleneck-Framework</span> </p> </li>
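<p>The information-bottleneck objective both PIB papers build on can be written as a task (distortion) term plus a weighted rate term; a minimal sketch with an assumed Gaussian encoder and an illustrative beta follows.</p> <pre><code>
# Hedged sketch of a variational IB loss: minimize task loss plus
# beta times a rate term. The Gaussian encoder and beta are assumptions.
import torch
import torch.nn.functional as F

def ib_loss(mu, logvar, logits, labels, beta=1e-3):
    # Rate: KL( N(mu, sigma) || N(0, I) ), an upper bound on I(X; Z).
    rate = 0.5 * (mu.pow(2) + logvar.exp() - 1.0 - logvar).sum(dim=1).mean()
    # Distortion: downstream task loss computed from the features Z.
    distortion = F.cross_entropy(logits, labels)
    return distortion + beta * rate

mu, logvar = torch.randn(8, 64), torch.zeros(8, 64)
logits, labels = torch.randn(8, 10), torch.randint(0, 10, (8,))
print(ib_loss(mu, logvar, logits, labels))
</code></pre>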
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.16343">arXiv:2408.16343</a> <span> [<a href="https://arxiv.org/pdf/2408.16343">pdf</a>, <a href="https://arxiv.org/format/2408.16343">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Toward Robust Early Detection of Alzheimer's Disease via an Integrated Multimodal Learning Approach </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yifei Chen</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+S">Shenghao Zhu</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+Z">Zhaojie Fang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+C">Chang Liu</a>, <a href="/search/cs?searchtype=author&query=Zou%2C+B">Binfeng Zou</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yuhe Wang</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+S">Shuo Chang</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+F">Fan Jia</a>, <a href="/search/cs?searchtype=author&query=Qin%2C+F">Feiwei Qin</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+J">Jin Fan</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+Y">Yong Peng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Changmiao Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2408.16343v1-abstract-full" style="display: inline;"> Alzheimer's Disease (AD) is a complex neurodegenerative disorder marked by memory loss, executive dysfunction, and personality changes. Early diagnosis is challenging due to subtle symptoms and varied presentations, often leading to misdiagnosis with traditional unimodal diagnostic methods, whose scope is limited. This study introduces an advanced multimodal classification model that integrates clinical, cognitive, neuroimaging, and EEG data to enhance diagnostic accuracy. The model incorporates a feature tagger with a tabular data coding architecture and utilizes the TimesBlock module to capture intricate temporal patterns in Electroencephalogram (EEG) data. By employing a Cross-modal Attention Aggregation module, the model effectively fuses Magnetic Resonance Imaging (MRI) spatial information with EEG temporal data, significantly improving the distinction between AD, Mild Cognitive Impairment, and Normal Cognition. Simultaneously, we have constructed the first AD classification dataset that includes three modalities: EEG, MRI, and tabular data. Our innovative approach aims to facilitate early diagnosis and intervention, potentially slowing the progression of AD. The source code and our private ADMC dataset are available at https://github.com/JustlfC03/MSTNet. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 2 figures</span> </p> </li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.15252">arXiv:2408.15252</a> <span> [<a href="https://arxiv.org/pdf/2408.15252">pdf</a>, <a href="https://arxiv.org/format/2408.15252">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Generative AI on SpectrumNet: An Open Benchmark of Multiband 3D Radio Maps </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shuhang Zhang</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+S">Shuai Jiang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+W">Wanjie Lin</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+Z">Zheng Fang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+K">Kangjun Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Hongliang Zhang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+K">Ke Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2408.15252v1-abstract-full" style="display: inline;"> A radio map is an efficient way to visually display wireless signal coverage within a certain region. It is considered increasingly helpful for the future sixth generation (6G) of wireless networks, as wireless nodes become more crowded and complicated. However, constructing high-resolution radio maps is very challenging due to sparse sampling in practical systems. Generative artificial intelligence (AI), which can create synthetic data to fill gaps in real-world measurements, is an effective technique for constructing high-precision radio maps. Currently, generative models for radio map construction are trained with two-dimensional (2D) single-band radio maps of urban scenarios, which generalize poorly across terrain scenarios, spectrum bands, and heights. To tackle this problem, we provide a multiband three-dimensional (3D) radio map dataset that takes terrain and climate information into consideration, named SpectrumNet. It is the largest radio map dataset in terms of dimensions and scale, containing radio maps spanning 3 spatial dimensions, 5 frequency bands, 11 terrain scenarios, and 3 climate scenarios. We introduce the parameters and settings for the SpectrumNet dataset generation, and evaluate three baseline methods for radio map construction based on the SpectrumNet dataset. Experiments show the necessity of the SpectrumNet dataset for training models with strong generalization in spatial, frequency, and scenario domains. Future works on the SpectrumNet dataset are also discussed, including dataset expansion and calibration, as well as extended studies on generative models for radio map construction based on the SpectrumNet dataset. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">30 pages, 15 figures</span> </p> </li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.14520">arXiv:2408.14520</a> <span> </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> </div> </div> <p class="title is-5 mathjax"> Towards Graph Prompt Learning: A Survey and Beyond </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Long%2C+Q">Qingqing Long</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+Y">Yuchen Yan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+P">Peiyan Zhang</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+C">Chen Fang</a>, <a href="/search/cs?searchtype=author&query=Cui%2C+W">Wentao Cui</a>, <a href="/search/cs?searchtype=author&query=Ning%2C+Z">Zhiyuan Ning</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+M">Meng Xiao</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+N">Ning Cao</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+X">Xiao Luo</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+L">Lingjun Xu</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+S">Shiyue Jiang</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+Z">Zheng Fang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+C">Chong Chen</a>, <a href="/search/cs?searchtype=author&query=Hua%2C+X">Xian-Sheng Hua</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yuanchun Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2408.14520v3-abstract-full" style="display: inline;"> Large-scale "pre-train and prompt learning" paradigms have demonstrated remarkable adaptability, enabling broad applications across diverse domains such as question answering, image recognition, and multimodal retrieval. This approach fully leverages the potential of large-scale pre-trained models, reducing downstream data requirements and computational costs while enhancing model applicability across various tasks. Graphs, as versatile data structures that capture relationships between entities, play pivotal roles in fields such as social network analysis, recommender systems, and biological graphs. Despite the success of pre-train and prompt learning paradigms in Natural Language Processing (NLP) and Computer Vision (CV), their application in graph domains remains nascent. In graph-structured data, not only do the node and edge features often have disparate distributions, but the topological structures also differ significantly. This diversity in graph data can lead to incompatible patterns or gaps between pre-training and fine-tuning on downstream graphs. We aim to bridge this gap by summarizing methods for alleviating these disparities. This includes exploring prompt design methodologies, comparing related techniques, assessing application scenarios and datasets, and identifying unresolved problems and challenges. This survey categorizes over 100 relevant works in this field, summarizing general design principles and the latest applications, including text-attributed graphs, molecules, proteins, and recommendation systems. Through this extensive review, we provide a foundational understanding of graph prompt learning, aiming to impact not only the graph mining community but also the broader Artificial General Intelligence (AGI) community. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">I have decided to temporarily withdraw this draft as I am in the process of making further revisions to improve its content</span> </p> </li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.10608">arXiv:2408.10608</a> <span> [<a href="https://arxiv.org/pdf/2408.10608">pdf</a>, <a href="https://arxiv.org/format/2408.10608">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Promoting Equality in Large Language Models: Identifying and Mitigating the Implicit Bias based on Bayesian Theory </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Deng%2C+Y">Yongxin Deng</a>, <a href="/search/cs?searchtype=author&query=Qiu%2C+X">Xihe Qiu</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+X">Xiaoyu Tan</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+J">Jing Pan</a>, <a href="/search/cs?searchtype=author&query=Jue%2C+C">Chen Jue</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+Z">Zhijun Fang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Y">Yinghui Xu</a>, <a href="/search/cs?searchtype=author&query=Chu%2C+W">Wei Chu</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+Y">Yuan Qi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2408.10608v1-abstract-full" style="display: inline;"> Large language models (LLMs) are trained on extensive text corpora, which inevitably include biased information. Although techniques such as Affective Alignment can mitigate some negative impacts of these biases, existing prompt-based attack methods can still extract these biases from the model's weights. Moreover, these biases frequently appear subtly when LLMs are prompted to perform identical tasks across different demographic groups, thereby camouflaging their presence. To address this issue, we have formally defined the implicit bias problem and developed an innovative framework for bias removal based on Bayesian theory, Bayesian-Theory based Bias Removal (BTBR). BTBR employs likelihood ratio screening to pinpoint data entries within publicly accessible biased datasets that represent biases inadvertently incorporated during the LLM training phase. It then automatically constructs relevant knowledge triples and expunges bias information from LLMs using model editing techniques. Through extensive experimentation, we have confirmed the presence of the implicit bias problem in LLMs and demonstrated the effectiveness of our BTBR approach. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li>
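<p>A hedged sketch of likelihood-ratio screening as described above: entries from a public bias dataset are scored by how much more likely the target LLM finds them than a reference model does. The models below are small stand-ins and the threshold is an assumption.</p> <pre><code>
# Hedged sketch: flag entries whose average token log-likelihood is much
# higher under the target model than under a reference model.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

def avg_logprob(model, tok, text):
    ids = tok(text, return_tensors="pt").input_ids
    with torch.no_grad():
        out = model(ids, labels=ids)
    return -out.loss.item()            # mean token log-likelihood

tok = AutoTokenizer.from_pretrained("gpt2")
target = AutoModelForCausalLM.from_pretrained("gpt2")         # stand-in LLM
reference = AutoModelForCausalLM.from_pretrained("distilgpt2")

entry = "Example sentence from a public bias dataset."
ratio = avg_logprob(target, tok, entry) - avg_logprob(reference, tok, entry)
flagged = ratio > 0.5                  # threshold is an assumption
print(ratio, flagged)
</code></pre>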
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.05802">arXiv:2408.05802</a> <span> [<a href="https://arxiv.org/pdf/2408.05802">pdf</a>, <a href="https://arxiv.org/format/2408.05802">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Egocentric Vision Language Planning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Fang%2C+Z">Zhirui Fang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+M">Ming Yang</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+W">Weishuai Zeng</a>, <a href="/search/cs?searchtype=author&query=Li%2C+B">Boyu Li</a>, <a href="/search/cs?searchtype=author&query=Yue%2C+J">Junpeng Yue</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+Z">Ziluo Ding</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiu Li</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+Z">Zongqing Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2408.05802v1-abstract-full" style="display: inline;"> We explore leveraging large multi-modal models (LMMs) and text2image models to build a more general embodied agent. LMMs excel in planning long-horizon tasks over symbolic abstractions but struggle with grounding in the physical world, often failing to accurately identify object positions in images. A bridge is needed to connect LMMs to the physical world. The paper proposes a novel approach, egocentric vision language planning (EgoPlan), to handle long-horizon tasks from an egocentric perspective in varying household scenarios. This model leverages a diffusion model to simulate the fundamental dynamics between states and actions, integrating techniques like style transfer and optical flow to enhance generalization across different environmental dynamics. The LMM serves as a planner, breaking down instructions into sub-goals and selecting actions based on their alignment with these sub-goals, thus enabling more generalized and effective decision-making. Experiments show that EgoPlan improves long-horizon task success rates from the egocentric view compared to baselines across household scenarios. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li>
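<p>A toy illustration of selecting the action best aligned with the current sub-goal; here off-the-shelf sentence embeddings stand in for the LMM's own scoring, which is an assumption for illustration only.</p> <pre><code>
# Hedged sketch: rank candidate actions by cosine similarity to the
# sub-goal embedding and pick the best match.
import torch
from sentence_transformers import SentenceTransformer

enc = SentenceTransformer("all-MiniLM-L6-v2")
subgoal = "pick up the mug from the kitchen table"
actions = ["move to the table", "open the fridge", "grasp the mug"]

e_goal = torch.tensor(enc.encode([subgoal]))
e_acts = torch.tensor(enc.encode(actions))
scores = torch.nn.functional.cosine_similarity(e_goal, e_acts)
print(actions[scores.argmax()])
</code></pre>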
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.05452">arXiv:2408.05452</a> <span> [<a href="https://arxiv.org/pdf/2408.05452">pdf</a>, <a href="https://arxiv.org/format/2408.05452">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> EV-MGDispNet: Motion-Guided Event-Based Stereo Disparity Estimation Network with Left-Right Consistency </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jiang%2C+J">Junjie Jiang</a>, <a href="/search/cs?searchtype=author&query=Zhuang%2C+H">Hao Zhuang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+X">Xinjie Huang</a>, <a href="/search/cs?searchtype=author&query=Kong%2C+D">Delei Kong</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+Z">Zheng Fang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2408.05452v1-abstract-full" style="display: inline;"> Event cameras have the potential to revolutionize the field of robot vision, particularly in areas like stereo disparity estimation, owing to their high temporal resolution and high dynamic range. Many studies use deep learning for event camera stereo disparity estimation. However, these methods fail to fully exploit the temporal information in the event stream to acquire clear event representations. Additionally, there is room for further reduction in pixel shifts in the feature maps before constructing the cost volume. In this paper, we propose EV-MGDispNet, a novel event-based stereo disparity estimation method. First, we propose an edge-aware aggregation (EAA) module, which fuses event frames and motion confidence maps to generate a novel clear event representation. Then, we propose a motion-guided attention (MGA) module, where motion confidence maps utilize deformable transformer encoders to enhance the feature map with more accurate edges. Finally, we also add a census left-right consistency loss function to enhance the left-right consistency of stereo event representations. Through experiments in challenging real-world driving scenarios, we validate that our method outperforms known state-of-the-art methods in terms of mean absolute error (MAE) and root mean square error (RMSE). </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li>
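<p>In the spirit of the EAA module (the actual module is considerably more elaborate), a minimal sketch of gating event features by a motion confidence map; the channel sizes are assumptions.</p> <pre><code>
# Hedged sketch: a learned gate built from event features concatenated
# with a one-channel motion confidence map reweights the event features.
import torch
import torch.nn as nn

class ConfidenceFusion(nn.Module):
    def __init__(self, ch=32):
        super().__init__()
        self.gate = nn.Sequential(nn.Conv2d(ch + 1, ch, 3, padding=1),
                                  nn.Sigmoid())

    def forward(self, event_feat, motion_conf):
        # motion_conf in [0, 1]; confidently moving regions dominate
        g = self.gate(torch.cat([event_feat, motion_conf], dim=1))
        return g * event_feat

fuse = ConfidenceFusion()
out = fuse(torch.randn(1, 32, 64, 64), torch.rand(1, 1, 64, 64))
print(out.shape)
</code></pre>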
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.03624">arXiv:2408.03624</a> <span> [<a href="https://arxiv.org/pdf/2408.03624">pdf</a>, <a href="https://arxiv.org/format/2408.03624">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> AgentsCoMerge: Large Language Model Empowered Collaborative Decision Making for Ramp Merging </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hu%2C+S">Senkang Hu</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+Z">Zhengru Fang</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+Z">Zihan Fang</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+Y">Yiqin Deng</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xianhao Chen</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+Y">Yuguang Fang</a>, <a href="/search/cs?searchtype=author&query=Kwong%2C+S">Sam Kwong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2408.03624v1-abstract-full" style="display: inline;"> Ramp merging is one of the bottlenecks in traffic systems, which commonly causes traffic congestion, accidents, and severe carbon emissions. In order to address this essential issue and enhance the safety and efficiency of connected and autonomous vehicles (CAVs) at multi-lane merging zones, we propose a novel collaborative decision-making framework, named AgentsCoMerge, to leverage large language models (LLMs). Specifically, we first design a scene observation and understanding module to allow an agent to capture the traffic environment. Then we propose a hierarchical planning module to enable the agent to make decisions and plan trajectories based on the observation and the agent's own state. In addition, in order to facilitate collaboration among multiple agents, we introduce a communication module to enable the surrounding agents to exchange necessary information and coordinate their actions. Finally, we develop a reinforcement reflection guided training paradigm to further enhance the decision-making capability of the framework. Extensive experiments are conducted to evaluate the performance of our proposed method, demonstrating its superior efficiency and effectiveness for multi-agent collaborative decision-making under various ramp merging scenarios. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li>
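<p>A schematic, not the authors' implementation, of the observe / plan / communicate loop described above; llm is any text-completion callable, and the reply format is an assumption for illustration.</p> <pre><code>
# Hedged sketch of per-round agent coordination: each agent prompts the
# LLM with its observation and inbox, then broadcasts a message.
def step(agent_id, observation, inbox, llm):
    prompt = (
        f"You control vehicle {agent_id} at a ramp merging zone.\n"
        f"Observation: {observation}\n"
        f"Messages from nearby vehicles: {inbox}\n"
        "Reply as: subgoal | action | broadcast message."
    )
    reply = llm(prompt)                       # assumed format, see lead-in
    _subgoal, action, message = [s.strip() for s in reply.split("|")]
    return action, message

def simulate(agents, observations, llm, rounds=3):
    inboxes = {a: [] for a in agents}
    for _ in range(rounds):
        outgoing = {}
        for a in agents:
            action, msg = step(a, observations[a], inboxes[a], llm)
            outgoing[a] = msg
        for a in agents:  # everyone receives the others' broadcasts
            inboxes[a] = [m for b, m in outgoing.items() if b != a]
</code></pre>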
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.02450">arXiv:2408.02450</a> <span> [<a href="https://arxiv.org/pdf/2408.02450">pdf</a>, <a href="https://arxiv.org/format/2408.02450">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> An Evaluation of Requirements Modeling for Cyber-Physical Systems via LLMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jin%2C+D">Dongming Jin</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+S">Shengxin Zhao</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+Z">Zhi Jin</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xiaohong Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Chunhui Wang</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+Z">Zheng Fang</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+H">Hongbin Xiao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2408.02450v1-abstract-full" style="display: inline;"> Cyber-physical systems (CPSs) integrate cyber and physical components and enable them to interact with each other to meet user needs. The needs for CPSs span rich application domains such as healthcare and medicine, smart homes, and smart buildings. This indicates that CPSs are all about solving real-world problems. With the increasing abundance of sensing devices and effectors, the problems CPSs are expected to solve are becoming more and more complex. It is also becoming increasingly difficult to extract and express CPS requirements accurately. The problem frames approach aims to shape real-world problems by capturing the characteristics and interconnections of components, with the problem diagram central to expressing the requirements. CPS requirements are generally presented in domain-specific documents that are normally expressed in natural language. There is currently no effective way to extract problem diagrams from natural language documents. CPS requirements extraction and modeling are generally done manually, which is time-consuming, labor-intensive, and error-prone. Large language models (LLMs) have shown excellent performance in natural language understanding, so it is worth exploring their ability to understand domain-specific documents and identify modeling elements, which is what this paper does. To achieve this goal, we first formulate two tasks (i.e., entity recognition and interaction extraction) and propose a benchmark called CPSBench. Based on this benchmark, extensive experiments are conducted to evaluate the abilities and limitations of seven advanced LLMs, yielding several interesting insights. Finally, we establish a taxonomy of LLM hallucinations in CPS requirements modeling using problem diagrams. These results will inspire research on the use of LLMs for automated CPS requirements modeling. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 8 figures</span> </p> </li>
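<p>Benchmarks built around the two formulated tasks are typically scored with set-level precision, recall, and F1; a minimal sketch on a made-up example follows (the paper may use a different matching scheme).</p> <pre><code>
# Hedged sketch: exact-match set scoring for entity recognition and
# interaction (triple) extraction. All data shown is invented.
def prf1(pred, gold):
    pred, gold = set(pred), set(gold)
    tp = len(pred.intersection(gold))
    p = tp / len(pred) if pred else 0.0
    r = tp / len(gold) if gold else 0.0
    f1 = 2 * p * r / (p + r) if (p + r) else 0.0
    return p, r, f1

gold_entities = {"thermostat", "room temperature", "heater"}
llm_entities = {"thermostat", "heater", "user"}    # parsed from LLM output
gold_triples = {("thermostat", "senses", "room temperature")}
llm_triples = {("thermostat", "senses", "room temperature")}

print(prf1(llm_entities, gold_entities))
print(prf1(llm_triples, gold_triples))
</code></pre>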
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.01928">arXiv:2408.01928</a> <span> [<a href="https://arxiv.org/pdf/2408.01928">pdf</a>, <a href="https://arxiv.org/format/2408.01928">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> A Semi-supervised Multi-channel Graph Convolutional Network for Query Classification in E-commerce </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yuan%2C+C">Chunyuan Yuan</a>, <a href="/search/cs?searchtype=author&query=Pang%2C+M">Ming Pang</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+Z">Zheng Fang</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+X">Xue Jiang</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+C">Changping Peng</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Z">Zhangang Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2408.01928v1-abstract-full" style="display: inline;"> Query intent classification is an essential module for customers to find desired products on e-commerce applications quickly. Most existing query intent classification methods rely on the users' click behavior as a supervised signal to construct training samples. However, these methods based entirely on posterior labels may lead to serious category imbalance problems because of the Matthew effect in click samples. Compared with popular categories, it is difficult for products under long-tail categories to obtain traffic and user clicks, which makes the models unable to detect users' intent for products under long-tail categories. This in turn aggravates the problem that long-tail categories cannot obtain traffic, forming a vicious circle. In addition, due to the randomness of users' clicks, the posterior label is unstable for queries with similar semantics, which makes the model very sensitive to the input, leading to unstable and incomplete recall of categories. In this paper, we propose a novel Semi-supervised Multi-channel Graph Convolutional Network (SMGCN) to address the above problems from the perspective of label association and semi-supervised learning. SMGCN extends category information and enhances the posterior label by utilizing the similarity score between the query and categories. Furthermore, it leverages the co-occurrence and semantic similarity graph of categories to strengthen the relations among labels and weaken the influence of posterior label instability. We conduct extensive offline and online A/B experiments, and the experimental results show that SMGCN significantly outperforms strong baselines, demonstrating its effectiveness and practicality. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by WWW2024</span> </p> </li>
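<p>A minimal sketch of the label-graph idea: posterior label scores are smoothed over a category co-occurrence graph so related long-tail categories receive credit. The tiny adjacency matrix is a made-up example, not the paper's data.</p> <pre><code>
# Hedged sketch: symmetrically normalized co-occurrence adjacency, then
# one GCN-style propagation of unstable posterior label scores.
import numpy as np

def normalize_adj(A):
    A = A + np.eye(len(A))                 # add self loops
    d = A.sum(axis=1)
    D = np.diag(1.0 / np.sqrt(d))
    return D @ A @ D                       # symmetric normalization

A = np.array([[0, 3, 0],                   # co-occurrence counts among
              [3, 0, 1],                   # three categories
              [0, 1, 0]], dtype=float)
scores = np.array([0.9, 0.1, 0.0])         # noisy posterior labels
smoothed = normalize_adj(A) @ scores       # one propagation step
print(smoothed)
</code></pre>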
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.00860">arXiv:2408.00860</a> <span> [<a href="https://arxiv.org/pdf/2408.00860">pdf</a>, <a href="https://arxiv.org/format/2408.00860">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> UlRe-NeRF: 3D Ultrasound Imaging through Neural Rendering with Ultrasound Reflection Direction Parameterization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guo%2C+Z">Ziwen Guo</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+Z">Zi Fang</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+Z">Zhuang Fu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2408.00860v3-abstract-full" style="display: inline;"> Three-dimensional ultrasound imaging is a critical technology widely used in medical diagnostics. However, traditional 3D ultrasound imaging methods have limitations such as fixed resolution, low storage efficiency, and insufficient contextual connectivity, leading to poor performance in handling complex artifacts and reflection characteristics. Recently, techniques based on NeRF (Neural Radiance Fields) have made significant progress in view synthesis and 3D reconstruction, but there remains a research gap in high-quality ultrasound imaging. To address these issues, we propose a new model, UlRe-NeRF, which combines implicit neural networks and explicit ultrasound volume rendering into an ultrasound neural rendering architecture. This model incorporates reflection direction parameterization and harmonic encoding, using a directional MLP module to generate view-dependent high-frequency reflection intensity estimates, and a spatial MLP module to produce the medium's physical property parameters. These parameters are used in the volume rendering process to accurately reproduce the propagation and reflection behavior of ultrasound waves in the medium. Experimental results demonstrate that the UlRe-NeRF model significantly enhances the realism and accuracy of high-fidelity ultrasound image reconstruction, especially in handling complex medium structures. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li>
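<p>A much-simplified sketch of the described architecture: harmonic encoding, a spatial MLP for medium parameters, and a directional MLP for reflection intensity, composed along a ray. Layer sizes and the rendering details are assumptions, not the paper's exact formulation.</p> <pre><code>
# Hedged sketch: encode ray samples, predict attenuation/scattering and
# view-dependent reflection, then accumulate along the ray.
import torch
import torch.nn as nn

def harmonic(x, n_freq=4):
    feats = [torch.sin((2 ** k) * torch.pi * x) for k in range(n_freq)]
    feats += [torch.cos((2 ** k) * torch.pi * x) for k in range(n_freq)]
    return torch.cat(feats, dim=-1)

spatial = nn.Sequential(nn.Linear(24, 64), nn.ReLU(), nn.Linear(64, 2))
directional = nn.Sequential(nn.Linear(48, 64), nn.ReLU(), nn.Linear(64, 1))

def render_ray(points, direction):
    # points: (N, 3) samples along the ray; direction: (3,) unit vector
    att_scatter = spatial(harmonic(points)).sigmoid()
    attenuation, scattering = att_scatter[:, 0], att_scatter[:, 1]
    d = harmonic(direction.expand_as(points))
    reflect = directional(torch.cat([harmonic(points), d], dim=-1)).squeeze(-1)
    transmit = torch.cumprod(1 - attenuation, dim=0)   # energy surviving
    return (transmit * scattering * reflect).sum()

print(render_ray(torch.rand(32, 3), torch.tensor([0.0, 0.0, 1.0])))
</code></pre>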
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.20741">arXiv:2407.20741</a> <span> [<a href="https://arxiv.org/pdf/2407.20741">pdf</a>, <a href="https://arxiv.org/format/2407.20741">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Dynamical Systems">math.DS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Numerical Analysis">math.NA</span> </div> </div> <p class="title is-5 mathjax"> Improving PINNs By Algebraic Inclusion of Boundary and Initial Conditions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ren%2C+M">Mohan Ren</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+Z">Zhihao Fang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+K">Keren Li</a>, <a href="/search/cs?searchtype=author&query=Mukherjee%2C+A">Anirbit Mukherjee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2407.20741v1-abstract-full" style="display: inline;"> "AI for Science" aims to solve fundamental scientific problems using AI techniques. As most physical phenomena can be described as Partial Differential Equations (PDEs), approximating their solutions using neural networks has evolved as a central component of scientific-ML. Physics-Informed Neural Networks (PINNs) are the general method that has evolved for this task, but their training is well known to be very unstable. In this work we explore the possibility of changing the model being trained from a plain neural network to a non-linear transformation of it - one that algebraically includes the boundary/initial conditions. This reduces the number of terms in the loss function compared to the standard PINN losses. We demonstrate that our modification leads to significant performance gains across a range of benchmark tasks, in various dimensions and without having to tweak the training algorithm. Our conclusions are based on conducting hundreds of experiments, in the fully unsupervised setting, over multiple linear and non-linear PDEs set to exactly solvable scenarios, which lends itself to a concrete measurement of our performance gains in terms of order(s) of magnitude lower fractional errors than standard PINNs. The code accompanying this manuscript is publicly available at https://github.com/MorganREN/Improving-PINNs-By-Algebraic-Inclusion-of-Boundary-and-Initial-Conditions </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">48 Pages, 25 Figures</span> </p> </li>
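<p>The central idea is easy to show for a 1D heat-type problem on x in [0,1]: train a transform of the network that satisfies the initial and boundary conditions exactly, so those loss terms vanish and only the PDE residual remains. The ansatz below is one standard choice, not necessarily the paper's exact transform.</p> <pre><code>
# Hedged sketch: u(x,t) = g(x) + x(1-x) t N(x,t) satisfies u(x,0)=g(x)
# and u(0,t)=u(1,t)=0 for ANY network parameters (with g(0)=g(1)=0).
import torch
import torch.nn as nn

net = nn.Sequential(nn.Linear(2, 64), nn.Tanh(), nn.Linear(64, 1))

def g(x):                      # initial condition u(x, 0)
    return torch.sin(torch.pi * x)

def u(x, t):
    xt = torch.cat([x, t], dim=-1)
    # x(1-x) kills the output on the spatial boundary; t kills it at t=0,
    # so the boundary/initial terms drop out of the training loss.
    return g(x) + x * (1 - x) * t * net(xt)

x = torch.rand(128, 1, requires_grad=True)
t = torch.rand(128, 1, requires_grad=True)
print(u(x, t).shape)           # only the PDE residual remains to be trained
</code></pre>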
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.19507">arXiv:2407.19507</a> <span> [<a href="https://arxiv.org/pdf/2407.19507">pdf</a>, <a href="https://arxiv.org/format/2407.19507">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> WeCromCL: Weakly Supervised Cross-Modality Contrastive Learning for Transcription-only Supervised Text Spotting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+J">Jingjing Wu</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+Z">Zhengyao Fang</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+P">Pengyuan Lyu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chengquan Zhang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+F">Fanglin Chen</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+G">Guangming Lu</a>, <a href="/search/cs?searchtype=author&query=Pei%2C+W">Wenjie Pei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2407.19507v1-abstract-full" style="display: inline;"> Transcription-only Supervised Text Spotting aims to learn text spotters relying only on transcriptions but no text boundaries for supervision, thus eliminating expensive boundary annotation. The crux of this task lies in locating each transcription in scene text images without location annotations. In this work, we formulate this challenging problem as a Weakly Supervised Cross-modality Contrastive Learning problem, and design a simple yet effective model dubbed WeCromCL that is able to detect each transcription in a scene image in a weakly supervised manner. Unlike typical methods for cross-modality contrastive learning that focus on modeling the holistic semantic correlation between an entire image and a text description, our WeCromCL conducts atomistic contrastive learning to model the character-wise appearance consistency between a text transcription and its correlated region in a scene image to detect an anchor point for the transcription in a weakly supervised manner. The detected anchor points by WeCromCL are further used as pseudo location labels to guide the learning of text spotting. Extensive experiments on four challenging benchmarks demonstrate the superior performance of our model over other methods. Code will be released. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ECCV 2024</span> </p> </li>
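<p>A hedged sketch of locating an anchor point for a transcription without box labels: correlate a text embedding with every spatial position of an image feature map and take a soft-argmax; the encoders here are random stand-ins.</p> <pre><code>
# Hedged sketch: similarity map between one text embedding and a feature
# grid, softened into a distribution whose expectation is the anchor.
import torch

def anchor_point(text_emb, feat_map, tau=0.1):
    # text_emb: (D,); feat_map: (D, H, W), both L2-normalized
    D, H, W = feat_map.shape
    sim = torch.einsum("d,dhw->hw", text_emb, feat_map)
    w = torch.softmax(sim.flatten() / tau, dim=0).view(H, W)
    ys = torch.arange(H, dtype=torch.float)
    xs = torch.arange(W, dtype=torch.float)
    y = (w.sum(dim=1) * ys).sum()        # soft-argmax row
    x = (w.sum(dim=0) * xs).sum()        # soft-argmax column
    return x, y                          # used as a pseudo location label

feat = torch.nn.functional.normalize(torch.randn(64, 32, 32), dim=0)
text = torch.nn.functional.normalize(torch.randn(64), dim=0)
print(anchor_point(text, feat))
</code></pre>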
In this work, we formulate this challenging problem as a Weakly Supervised Cross-modality Contrastive Learning problem, and design a simple yet effective model dubbed WeCromCL that is able to detect each transcription in a scene image in a weakly supervised manner. Unlike typical methods for cross-modality contrastive learning that focus on modeling the holistic semantic correlation between an entire image and a text description, our WeCromCL conducts atomistic contrastive learning to model the character-wise appearance consistency between a text transcription and its correlated region in a scene image to detect an anchor point for the transcription in a weakly supervised manner. The detected anchor points by WeCromCL are further used as pseudo location labels to guide the learning of text spotting. Extensive experiments on four challenging benchmarks demonstrate the superior performance of our model over other methods. Code will be released. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.19507v1-abstract-full').style.display = 'none'; document.getElementById('2407.19507v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ECCV 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.19079">arXiv:2407.19079</a> <span> [<a href="https://arxiv.org/pdf/2407.19079">pdf</a>, <a href="https://arxiv.org/format/2407.19079">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> UniForensics: Face Forgery Detection via General Facial Representation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Fang%2C+Z">Ziyuan Fang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+H">Hanqing Zhao</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+T">Tianyi Wei</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+W">Wenbo Zhou</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+M">Ming Wan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhanyi Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Weiming Zhang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+N">Nenghai Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.19079v1-abstract-short" style="display: inline;"> Previous deepfake detection methods mostly depend on low-level textural features vulnerable to perturbations and fall short of detecting unseen forgery methods. In contrast, high-level semantic features are less susceptible to perturbations and not limited to forgery-specific artifacts, thus having stronger generalization. 
arXiv:2407.19079 [cs.CV] (https://arxiv.org/abs/2407.19079)
Title: UniForensics: Face Forgery Detection via General Facial Representation
Authors: Ziyuan Fang, Hanqing Zhao, Tianyi Wei, Wenbo Zhou, Ming Wan, Zhanyi Wang, Weiming Zhang, Nenghai Yu
Abstract: Previous deepfake detection methods mostly depend on low-level textural features that are vulnerable to perturbations and fall short of detecting unseen forgery methods. In contrast, high-level semantic features are less susceptible to perturbations and not limited to forgery-specific artifacts, and thus generalize more strongly. Motivated by this, we propose a detection method that utilizes high-level semantic features of faces to identify inconsistencies in the temporal domain. We introduce UniForensics, a novel deepfake detection framework that leverages a transformer-based video classification network, initialized with a meta-functional face encoder for enriched facial representation. In this way, we can take advantage of both the powerful spatio-temporal model and the high-level semantic information of faces. Furthermore, to leverage easily accessible real face data and guide the model to focus on spatio-temporal features, we design a Dynamic Video Self-Blending (DVSB) method to efficiently generate training samples with diverse spatio-temporal forgery traces from real facial videos. Based on this, we advance our framework with a two-stage training approach: the first stage employs a novel self-supervised contrastive learning scheme, encouraging the network to focus on forgery traces by driving videos generated by the same forgery process toward similar representations. Building on the representation learned in the first stage, the second stage fine-tunes on a face forgery detection dataset to build a deepfake detector. Extensive experiments validate that UniForensics outperforms existing face forgery detection methods in generalization ability and robustness. In particular, our method achieves 95.3% and 77.2% cross-dataset AUC on the challenging Celeb-DFv2 and DFDC, respectively.
Submitted 26 July, 2024; originally announced July 2024.

arXiv:2407.18175 [cs.LG, cs.AI, cs.CV] (https://arxiv.org/abs/2407.18175)
Title: Quasar-ViT: Hardware-Oriented Quantization-Aware Architecture Search for Vision Transformers
Authors: Zhengang Li, Alec Lu, Yanyue Xie, Zhenglun Kong, Mengshu Sun, Hao Tang, Zhong Jia Xue, Peiyan Dong, Caiwen Ding, Yanzhi Wang, Xue Lin, Zhenman Fang
Abstract: Vision transformers (ViTs) have demonstrated superior accuracy on computer vision tasks compared to convolutional neural networks (CNNs). However, ViT models are often too computation-intensive for efficient deployment on resource-limited edge devices. This work proposes Quasar-ViT, a hardware-oriented quantization-aware architecture search framework for ViTs, to design efficient ViT models for hardware implementation while preserving accuracy. First, Quasar-ViT trains a supernet using our row-wise flexible mixed-precision quantization scheme, mixed-precision weight entanglement, and supernet layer scaling techniques. Then, it applies an efficient hardware-oriented search algorithm, integrated with hardware latency and resource modeling, to determine a series of optimal subnets from the supernet under different inference latency targets. Finally, we propose a series of model-adaptive designs on the FPGA platform to support the architecture search and mitigate the gap between the theoretical computation reduction and the practical inference speedup. Our searched models achieve 101.5, 159.6, and 251.6 frames per second (FPS) inference speed on the AMD/Xilinx ZCU102 FPGA with 80.4%, 78.6%, and 74.9% top-1 accuracy, respectively, on the ImageNet dataset, consistently outperforming prior works.
Submitted 25 July, 2024; originally announced July 2024.
Comments: Accepted by ICS 2024
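As a rough illustration of what row-wise mixed-precision quantization can mean in practice, here is a small fake-quantization sketch; the per-row symmetric scheme below is an assumption for illustration, not the paper's actual quantizer:

import torch

def quantize_rowwise(W, bits_per_row):
    """Fake-quantize each row of W to its own bit-width (symmetric, per-row scale).
    W: (rows, cols) float tensor; bits_per_row: one int per row.
    Illustrative only: real schemes also handle rounding modes, zero points, etc."""
    out = torch.empty_like(W)
    for i, bits in enumerate(bits_per_row):
        qmax = 2 ** (bits - 1) - 1                         # e.g. 127 for 8-bit
        scale = W[i].abs().max().clamp(min=1e-8) / qmax    # per-row scale factor
        out[i] = torch.round(W[i] / scale).clamp(-qmax, qmax) * scale
    return out

W = torch.randn(4, 8)
Wq = quantize_rowwise(W, bits_per_row=[8, 4, 4, 2])        # mixed precision across rows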
arXiv:2407.17905 [cs.CV, cs.RO] (https://arxiv.org/abs/2407.17905)
Title: StreamMOS: Streaming Moving Object Segmentation with Multi-View Perception and Dual-Span Memory
Authors: Zhiheng Li, Yubo Cui, Jiexi Zhong, Zheng Fang
Abstract: Moving object segmentation based on LiDAR is a crucial and challenging task for autonomous driving and mobile robotics. Most approaches explore spatio-temporal information from LiDAR sequences to predict moving objects in the current frame. However, they often focus on transferring temporal cues in a single inference and regard every prediction as independent of the others. This may cause inconsistent segmentation results for the same object across frames. To overcome this issue, we propose a streaming network with a memory mechanism, called StreamMOS, to build associations between features and predictions across multiple inferences. Specifically, we utilize a short-term memory to convey historical features, which can be regarded as a spatial prior on moving objects and used to enhance the current inference via temporal fusion. Meanwhile, we build a long-term memory that stores previous predictions and exploits them to refine the present forecast at the voxel and instance levels through voting. Besides, we present a multi-view encoder with cascade projection and asymmetric convolution to extract motion features of objects in different representations. Extensive experiments validate that our algorithm achieves competitive performance on the SemanticKITTI and Sipailou Campus datasets. Code will be released at https://github.com/NEU-REAL/StreamMOS.git.
Submitted 25 July, 2024; originally announced July 2024.
Comments: 8 pages, 7 figures
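The long-term-memory voting idea can be pictured with a toy sketch; the array shapes and blending rule here are assumptions, and a real system must first align voxels across frames:

import numpy as np

def refine_by_voting(current_scores, past_masks, weight=0.5):
    """Blend current per-voxel moving scores with a vote over earlier frames.
    current_scores: (N,) moving probabilities for N voxels in this frame.
    past_masks: (T, N) binary moving/static predictions from T earlier frames,
    assumed already aligned to the current frame.
    Hypothetical sketch of long-term-memory voting, not the paper's module."""
    votes = past_masks.mean(axis=0)                     # fraction of past frames voting "moving"
    blended = (1 - weight) * current_scores + weight * votes
    return (blended > 0.5).astype(np.int32)             # refined moving-object mask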
class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.15719v1-abstract-short" style="display: inline;"> Alzheimer's Disease (AD) is an irreversible neurodegenerative disorder that often progresses from Mild Cognitive Impairment (MCI), leading to memory loss and significantly impacting patients' lives. Clinical trials indicate that early targeted interventions for MCI patients can potentially slow or halt the development and progression of AD. Previous research has shown that accurate medical classif… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.15719v1-abstract-full').style.display = 'inline'; document.getElementById('2407.15719v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.15719v1-abstract-full" style="display: none;"> Alzheimer's Disease (AD) is an irreversible neurodegenerative disorder that often progresses from Mild Cognitive Impairment (MCI), leading to memory loss and significantly impacting patients' lives. Clinical trials indicate that early targeted interventions for MCI patients can potentially slow or halt the development and progression of AD. Previous research has shown that accurate medical classification requires the inclusion of extensive multimodal data, such as assessment scales and various neuroimaging techniques like Magnetic Resonance Imaging (MRI) and Positron Emission Tomography (PET). However, consistently tracking the diagnosis of the same individual over time and simultaneously collecting multimodal data poses significant challenges. To address this issue, we introduce GFE-Mamba, a classifier based on Generative Feature Extraction (GFE). This classifier effectively integrates data from assessment scales, MRI, and PET, enabling deeper multimodal fusion. It efficiently extracts both long and short sequence information and incorporates additional information beyond the pixel space. This approach not only improves classification accuracy but also enhances the interpretability and stability of the model. We constructed datasets of over 3000 samples based on the Alzheimer's Disease Neuroimaging Initiative (ADNI) for a two-step training process. Our experimental results demonstrate that the GFE-Mamba model is effective in predicting the conversion from MCI to AD and outperforms several state-of-the-art methods. Our source code and ADNI dataset processing code are available at https://github.com/Tinysqua/GFE-Mamba. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.15719v1-abstract-full').style.display = 'none'; document.getElementById('2407.15719v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">35 pages, 4 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.09854">arXiv:2407.09854</a> <span> [<a href="https://arxiv.org/pdf/2407.09854">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Digital Libraries">cs.DL</span> </div> </div> <p class="title is-5 mathjax"> Science cited in policy documents: Evidence from the Overton database </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Fang%2C+Z">Zhichao Fang</a>, <a href="/search/cs?searchtype=author&query=Dudek%2C+J">Jonathan Dudek</a>, <a href="/search/cs?searchtype=author&query=Noyons%2C+E">Ed Noyons</a>, <a href="/search/cs?searchtype=author&query=Costas%2C+R">Rodrigo Costas</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.09854v1-abstract-short" style="display: inline;"> To reflect the extent to which science is cited in policy documents, this paper explores the presence of policy document citations for over 18 million Web of Science-indexed publications published between 2010 and 2019. Enabled by the policy document citation data provided by Overton, a searchable index of policy documents worldwide, the results show that there are 3.9% of publications in the data… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.09854v1-abstract-full').style.display = 'inline'; document.getElementById('2407.09854v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.09854v1-abstract-full" style="display: none;"> To reflect the extent to which science is cited in policy documents, this paper explores the presence of policy document citations for over 18 million Web of Science-indexed publications published between 2010 and 2019. Enabled by the policy document citation data provided by Overton, a searchable index of policy documents worldwide, the results show that there are 3.9% of publications in the dataset cited at least once by policy documents. Policy document citations present a citation delay towards newly published publications and show a stronger predominance to the document types of review and article. Based on the Overton database, publications in the field of Social Sciences and Humanities have the highest relative presence in policy document citations, followed by Life and Earth Sciences and Biomedical and Health Sciences. Our findings shed light not only on the impact of scientific knowledge on the policy-making process, but also on the particular focus of policy documents indexed by Overton on specific research areas. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.09854v1-abstract-full').style.display = 'none'; document.getElementById('2407.09854v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The 2020 Altmetric Conference</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.08517">arXiv:2407.08517</a> <span> [<a href="https://arxiv.org/pdf/2407.08517">pdf</a>, <a href="https://arxiv.org/format/2407.08517">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Generalized Low-Rank Matrix Completion Model with Overlapping Group Error Representation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lu%2C+W">Wenjing Lu</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+Z">Zhuang Fang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+L">Liang Wu</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+L">Liming Tang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Hanxin Liu</a>, <a href="/search/cs?searchtype=author&query=He%2C+C">Chuanjiang He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.08517v2-abstract-short" style="display: inline;"> The low-rank matrix completion (LRMC) technology has achieved remarkable results in low-level visual tasks. There is an underlying assumption that the real-world matrix data is low-rank in LRMC. However, the real matrix data does not satisfy the strict low-rank property, which undoubtedly present serious challenges for the above-mentioned matrix recovery methods. Fortunately, there are feasible sc… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.08517v2-abstract-full').style.display = 'inline'; document.getElementById('2407.08517v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.08517v2-abstract-full" style="display: none;"> The low-rank matrix completion (LRMC) technology has achieved remarkable results in low-level visual tasks. There is an underlying assumption that the real-world matrix data is low-rank in LRMC. However, the real matrix data does not satisfy the strict low-rank property, which undoubtedly present serious challenges for the above-mentioned matrix recovery methods. Fortunately, there are feasible schemes that devise appropriate and effective priori representations for describing the intrinsic information of real data. In this paper, we firstly model the matrix data ${\bf{Y}}$ as the sum of a low-rank approximation component $\bf{X}$ and an approximation error component $\cal{E}$. This finer-grained data decomposition architecture enables each component of information to be portrayed more precisely. Further, we design an overlapping group error representation (OGER) function to characterize the above error structure and propose a generalized low-rank matrix completion model based on OGER. Specifically, the low-rank component describes the global structure information of matrix data, while the OGER component not only compensates for the approximation error between the low-rank component and the real data but also better captures the local block sparsity information of matrix data. 
Finally, we develop an alternating direction method of multipliers (ADMM) that integrates the majorization-minimization (MM) algorithm, which enables the efficient solution of the proposed model. And we analyze the convergence of the algorithm in detail both theoretically and experimentally. In addition, the results of numerical experiments demonstrate that the proposed model outperforms existing competing models in performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.08517v2-abstract-full').style.display = 'none'; document.getElementById('2407.08517v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 11 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.08498">arXiv:2407.08498</a> <span> [<a href="https://arxiv.org/pdf/2407.08498">pdf</a>, <a href="https://arxiv.org/format/2407.08498">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> ERD: Exponential Retinex decomposition based on weak space and hybrid nonconvex regularization and its denoising application </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+L">Liang Wu</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+W">Wenjing Lu</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+L">Liming Tang</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+Z">Zhuang Fang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.08498v2-abstract-short" style="display: inline;"> The Retinex theory models the image as a product of illumination and reflection components, which has received extensive attention and is widely used in image enhancement, segmentation and color restoration. However, it has been rarely used in additive noise removal due to the inclusion of both multiplication and addition operations in the Retinex noisy image modeling. In this paper, we propose an… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.08498v2-abstract-full').style.display = 'inline'; document.getElementById('2407.08498v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.08498v2-abstract-full" style="display: none;"> The Retinex theory models the image as a product of illumination and reflection components, which has received extensive attention and is widely used in image enhancement, segmentation and color restoration. However, it has been rarely used in additive noise removal due to the inclusion of both multiplication and addition operations in the Retinex noisy image modeling. 
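Based only on the decomposition stated in the abstract, the optimization model plausibly takes a form like the following; this is a hedged reconstruction, and the rank surrogate, the OGER penalty, and the constraint handling are assumptions rather than the paper's exact formulation:

\[
\min_{{\bf X},\,{\cal E}} \; \operatorname{rank}({\bf X}) + \lambda\, \phi_{\mathrm{OGER}}({\cal E})
\quad \text{s.t.} \quad P_\Omega({\bf Y}) = P_\Omega({\bf X} + {\cal E}),
\]

where $P_\Omega$ keeps only the observed entries and $\phi_{\mathrm{OGER}}$ is an overlapping-group sparsity penalty intended to capture the local block sparsity of the error.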
arXiv:2407.08498 [cs.CV, eess.IV] (https://arxiv.org/abs/2407.08498)
Title: ERD: Exponential Retinex decomposition based on weak space and hybrid nonconvex regularization and its denoising application
Authors: Liang Wu, Wenjing Lu, Liming Tang, Zhuang Fang
Abstract: The Retinex theory models an image as the product of illumination and reflection components, and it has received extensive attention and wide use in image enhancement, segmentation, and color restoration. However, it has rarely been used for additive noise removal, because Retinex modeling of a noisy image involves both multiplication and addition operations. In this paper, we propose an exponential Retinex decomposition model for image denoising based on hybrid nonconvex regularization and weak-space oscillation modeling. The proposed model uses nonconvex first-order total variation (TV) and nonconvex second-order TV to regularize the reflection component and the illumination component, respectively, and employs the weak $H^{-1}$ norm to measure the residual component. By using different regularizers, the model effectively decomposes the image into reflection, illumination, and noise components. An alternating direction method of multipliers (ADMM) combined with the majorize-minimization (MM) algorithm is developed to solve the proposed model, and we provide a detailed proof of its convergence. Numerical experiments validate both the model and the algorithm. Compared with several state-of-the-art denoising models, the proposed model exhibits superior performance in terms of peak signal-to-noise ratio (PSNR) and mean structural similarity (MSSIM).
Submitted 20 July, 2024; v1 submitted 11 July, 2024; originally announced July 2024.
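A plausible shape of the objective, assumed from the abstract rather than taken from the paper: with $f$ the log-domain observed image, and log-reflectance $r$ and log-illumination $\ell$ sought so that the image is approximately $e^{r} \cdot e^{\ell}$,

\[
\min_{r,\,\ell}\; \Phi_1(\nabla r) + \alpha\, \Phi_2(\nabla^2 \ell) + \mu\, \| f - r - \ell \|_{H^{-1}}^2,
\]

where $\Phi_1$ and $\Phi_2$ are nonconvex first- and second-order TV penalties and the weak $H^{-1}$ norm absorbs the oscillatory noise residual.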
arXiv:2407.06317 [cs.AI, cs.CV, cs.RO] (https://arxiv.org/abs/2407.06317)
Title: Enhanced Safety in Autonomous Driving: Integrating Latent State Diffusion Model for End-to-End Navigation
Authors: Detian Chu, Linyuan Bai, Jianuo Huang, Zhenlong Fang, Peng Zhang, Wei Kang, Haifeng Lin
Abstract: With the advancement of autonomous driving, ensuring safety during motion planning and navigation is becoming increasingly important. However, most end-to-end planning methods suffer from a lack of safety. This research addresses the safety issue in the control optimization problem of autonomous driving, formulated as Constrained Markov Decision Processes (CMDPs). We propose a novel model-based approach to policy optimization, utilizing a Conditional Value-at-Risk-based Soft Actor-Critic to manage constraints in complex, high-dimensional state spaces effectively. Our method introduces a worst-case actor to guide safe exploration, ensuring rigorous adherence to safety requirements even in unpredictable scenarios. Policy optimization employs the Augmented Lagrangian method and leverages latent diffusion models to predict and simulate future trajectories. This dual approach not only aids in navigating environments safely but also refines the policy's performance by integrating distribution modeling to account for environmental uncertainties. Empirical evaluations conducted in both simulated and real environments demonstrate that our approach outperforms existing methods in terms of safety, efficiency, and decision-making capabilities.
Submitted 17 July, 2024; v1 submitted 8 July, 2024; originally announced July 2024.
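For readers unfamiliar with CMDPs, the constrained objective and one common augmented-Lagrangian relaxation look roughly as follows (standard textbook form, not necessarily the paper's exact instantiation):

\[
\max_{\pi}\; J_r(\pi) \quad \text{s.t.} \quad J_c(\pi) \le d
\qquad \Longrightarrow \qquad
\min_{\lambda \ge 0}\, \max_{\pi}\; J_r(\pi) - \lambda\,\bigl(J_c(\pi) - d\bigr) - \tfrac{\rho}{2}\,\bigl(J_c(\pi) - d\bigr)_+^2,
\]

where $J_r$ and $J_c$ are the expected return and expected cost, $d$ is the safety budget, $\lambda$ is a Lagrange multiplier, and $\rho$ is a penalty coefficient.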
arXiv:2407.01959 [cs.CV] (https://arxiv.org/abs/2407.01959)
Title: FlowTrack: Point-level Flow Network for 3D Single Object Tracking
Authors: Shuo Li, Yubo Cui, Zhiheng Li, Zheng Fang
Abstract: 3D single object tracking (SOT) is a crucial task in the fields of mobile robotics and autonomous driving. Traditional motion-based approaches achieve target tracking by estimating the relative movement of the target between two consecutive frames. However, they usually overlook the local motion information of the target and fail to exploit historical frame information effectively. To overcome these limitations, we propose a point-level flow method with multi-frame information for the 3D SOT task, called FlowTrack. Specifically, by estimating the flow of each point in the target, our method captures the local motion details of the target, thereby improving tracking performance. At the same time, to handle scenes with sparse points, we present a learnable target feature as a bridge to efficiently integrate target information from past frames. Moreover, we design a novel Instance Flow Head to transform dense point-level flow into instance-level motion, effectively aggregating local motion information to obtain the global target motion. Finally, our method achieves competitive performance, with improvements of 5.9% on the KITTI dataset and 2.9% on NuScenes. The code will be made publicly available soon.
Submitted 2 July, 2024; originally announced July 2024.
Comments: Accepted by IROS2024
arXiv:2407.00952 [cs.LG, cs.CL, cs.DC] (https://arxiv.org/abs/2407.00952)
Title: SplitLoRA: A Split Parameter-Efficient Fine-Tuning Framework for Large Language Models
Authors: Zheng Lin, Xuanjie Hu, Yuxin Zhang, Zhe Chen, Zihan Fang, Xianhao Chen, Ang Li, Praneeth Vepakomma, Yue Gao
Abstract: The scalability of large language models (LLMs) in handling high-complexity models and large-scale datasets has led to tremendous successes in pivotal domains. While there is an urgent need to acquire more training data for LLMs, a concerning reality is the depletion of high-quality public datasets within a few years. In view of this, the federated learning (FL) LLM fine-tuning paradigm has recently been proposed to facilitate collaborative LLM fine-tuning on distributed private data, where multiple data owners collaboratively fine-tune a shared LLM without sharing raw data. However, the staggering model size of LLMs imposes heavy computing and communication burdens on clients, posing significant barriers to the democratization of the FL LLM fine-tuning paradigm. To address this issue, split learning (SL) has emerged as a promising solution by offloading the primary training workload to a server via model partitioning, while exchanging activations and their gradients, which are much smaller than the entire LLM. Unfortunately, research on the SL LLM fine-tuning paradigm is still in its nascent stage. To fill this gap, we propose the first SL LLM fine-tuning framework, named SplitLoRA. SplitLoRA is built on the split federated learning (SFL) framework, amalgamating the advantages of parallel training from FL and model splitting from SL, thus greatly enhancing training efficiency. Notably, SplitLoRA is the inaugural open-source benchmark for SL LLM fine-tuning, providing a foundation for research efforts dedicated to advancing SL LLM fine-tuning. Extensive simulations validate that SplitLoRA achieves target accuracy in significantly less time than state-of-the-art LLM fine-tuning frameworks, demonstrating its superior training performance. The project page is available at https://fduinc.github.io/splitlora/.
Submitted 1 July, 2024; originally announced July 2024.
Comments: 9 pages, 3 figures
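To illustrate the two ingredients SplitLoRA combines, LoRA adapters and a client/server split, here is a minimal sketch; the class below is a generic LoRA layer, not SplitLoRA's code, and the split description in the comments is an assumed structure:

import torch
import torch.nn as nn

class LoRALinear(nn.Module):
    """A frozen linear layer plus a trainable low-rank update: W x + (alpha/r) B A x."""
    def __init__(self, base: nn.Linear, r=8, alpha=16):
        super().__init__()
        self.base = base
        for p in self.base.parameters():
            p.requires_grad = False                 # only the adapters are trained
        self.A = nn.Parameter(torch.randn(r, base.in_features) * 0.01)
        self.B = nn.Parameter(torch.zeros(base.out_features, r))  # zero init: no change at start
        self.scale = alpha / r

    def forward(self, x):
        return self.base(x) + self.scale * (x @ self.A.t() @ self.B.t())

# In a split-learning setup, the client runs the first few transformer blocks and
# sends only the cut-layer activations to the server, which runs the remaining
# blocks; gradients flow back through the same cut. Both sides update only their
# LoRA adapters, so the exchanged tensors stay far smaller than the full LLM.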
arXiv:2406.19311 [cs.CR, cs.SD, eess.AS] (https://arxiv.org/abs/2406.19311)
Title: Zero-Query Adversarial Attack on Black-box Automatic Speech Recognition Systems
Authors: Zheng Fang, Tao Wang, Lingchen Zhao, Shenyi Zhang, Bowen Li, Yunjie Ge, Qi Li, Chao Shen, Qian Wang
Abstract: In recent years, extensive research has been conducted on the vulnerability of ASR systems, revealing that black-box adversarial example attacks pose significant threats to real-world ASR systems. However, most existing black-box attacks rely on queries to the target ASR, which is impractical when queries are not permitted. In this paper, we propose ZQ-Attack, a transfer-based adversarial attack on ASR systems in the zero-query black-box setting. Through a comprehensive review and categorization of modern ASR technologies, we first meticulously select surrogate ASRs of diverse types to generate adversarial examples. Following this, ZQ-Attack initializes the adversarial perturbation with a scaled target-command audio, rendering it relatively imperceptible while maintaining effectiveness. Subsequently, to achieve high transferability of the adversarial perturbation, we propose a sequential ensemble optimization algorithm, which iteratively optimizes the perturbation on each surrogate model, leveraging collaborative information from the other models. We conduct extensive experiments to evaluate ZQ-Attack. In the over-the-line setting, ZQ-Attack achieves a 100% success rate of attack (SRoA) with an average signal-to-noise ratio (SNR) of 21.91 dB on 4 online speech recognition services, and attains an average SRoA of 100% and an SNR of 19.67 dB on 16 open-source ASRs. For commercial intelligent voice control devices, ZQ-Attack also achieves a 100% SRoA with an average SNR of 15.77 dB in the over-the-air setting.
Submitted 27 June, 2024; originally announced June 2024.
Comments: To appear in the Proceedings of The ACM Conference on Computer and Communications Security (CCS), 2024
class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>