Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/>  <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b">  <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\$","\$"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a>  <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div>  <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div>  <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 1,611 results for author: <span class="mathjax">Li, G</span> </h1> </div> <div class="level-right is-hidden-mobile">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Li%2C+G">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Li, G"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Li%2C+G&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Li, G"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Li%2C+G&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Li%2C+G&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Li%2C+G&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Li%2C+G&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Li%2C+G&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Li%2C+G&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.12760">arXiv:2412.12760</a> <span> [<a href="https://arxiv.org/pdf/2412.12760">pdf</a>, <a href="https://arxiv.org/format/2412.12760">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> CAMEL: Cross-Attention Enhanced Mixture-of-Experts and Language Bias for Code-Switching Speech Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+H">He Wang</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+X">Xucheng Wan</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+N">Naijun Zheng</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+K">Kai Liu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+H">Huan Zhou</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Guojian Li</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+L">Lei Xie</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.12760v1-abstract-short" style="display: inline;"> Code-switching automatic speech recognition (ASR) aims to transcribe speech that contains two or more languages accurately. To better capture language-specific speech representations and address language confusion in code-switching ASR, the mixture-of-experts (MoE) architecture and an additional language diarization (LD) decoder are commonly employed. However, most researches remain stagnant in si… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.12760v1-abstract-full').style.display = 'inline'; document.getElementById('2412.12760v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.12760v1-abstract-full" style="display: none;"> Code-switching automatic speech recognition (ASR) aims to transcribe speech that contains two or more languages accurately. To better capture language-specific speech representations and address language confusion in code-switching ASR, the mixture-of-experts (MoE) architecture and an additional language diarization (LD) decoder are commonly employed. However, most researches remain stagnant in simple operations like weighted summation or concatenation to fuse language-specific speech representations, leaving significant opportunities to explore the enhancement of integrating language bias information. In this paper, we introduce CAMEL, a cross-attention-based MoE and language bias approach for code-switching ASR. Specifically, after each MoE layer, we fuse language-specific speech representations with cross-attention, leveraging its strong contextual modeling abilities. Additionally, we design a source attention-based mechanism to incorporate the language information from the LD decoder output into text embeddings. Experimental results demonstrate that our approach achieves state-of-the-art performance on the SEAME, ASRU200, and ASRU700+LibriSpeech460 Mandarin-English code-switching ASR datasets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.12760v1-abstract-full').style.display = 'none'; document.getElementById('2412.12760v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICASSP 2025. 5 pages, 2 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.12223">arXiv:2412.12223</a> <span> [<a href="https://arxiv.org/pdf/2412.12223">pdf</a>, <a href="https://arxiv.org/format/2412.12223">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Can video generation replace cinematographers? Research on the cinematic language of generated video </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiaozhe Li</a>, <a href="/search/cs?searchtype=author&query=WU%2C+K">Kai WU</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+S">Siyi Yang</a>, <a href="/search/cs?searchtype=author&query=Qu%2C+Y">YiZhan Qu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+G">Guohua. Zhang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhiyu Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jiayao Li</a>, <a href="/search/cs?searchtype=author&query=Mu%2C+J">Jiangchuan Mu</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+X">Xiaobin Hu</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+W">Wen Fang</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+M">Mingliang Xiong</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+H">Hao Deng</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Q">Qingwen Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Gang Li</a>, <a href="/search/cs?searchtype=author&query=He%2C+B">Bin He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.12223v1-abstract-short" style="display: inline;"> Recent advancements in text-to-video (T2V) generation have leveraged diffusion models to enhance the visual coherence of videos generated from textual descriptions. However, most research has primarily focused on object motion, with limited attention given to cinematic language in videos, which is crucial for cinematographers to convey emotion and narrative pacing. To address this limitation, we p… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.12223v1-abstract-full').style.display = 'inline'; document.getElementById('2412.12223v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.12223v1-abstract-full" style="display: none;"> Recent advancements in text-to-video (T2V) generation have leveraged diffusion models to enhance the visual coherence of videos generated from textual descriptions. However, most research has primarily focused on object motion, with limited attention given to cinematic language in videos, which is crucial for cinematographers to convey emotion and narrative pacing. To address this limitation, we propose a threefold approach to enhance the ability of T2V models to generate controllable cinematic language. Specifically, we introduce a cinematic language dataset that encompasses shot framing, angle, and camera movement, enabling models to learn diverse cinematic styles. Building on this, to facilitate robust cinematic alignment evaluation, we present CameraCLIP, a model fine-tuned on the proposed dataset that excels in understanding complex cinematic language in generated videos and can further provide valuable guidance in the multi-shot composition process. Finally, we propose CLIPLoRA, a cost-guided dynamic LoRA composition method that facilitates smooth transitions and realistic blending of cinematic language by dynamically fusing multiple pre-trained cinematic LoRAs within a single video. Our experiments demonstrate that CameraCLIP outperforms existing models in assessing the alignment between cinematic language and video, achieving an R@1 score of 0.81. Additionally, CLIPLoRA improves the ability for multi-shot composition, potentially bridging the gap between automatically generated videos and those shot by professional cinematographers. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.12223v1-abstract-full').style.display = 'none'; document.getElementById('2412.12223v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.11680">arXiv:2412.11680</a> <span> [<a href="https://arxiv.org/pdf/2412.11680">pdf</a>, <a href="https://arxiv.org/format/2412.11680">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> EGP3D: Edge-guided Geometric Preserving 3D Point Cloud Super-resolution for RGB-D camera </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Fang%2C+Z">Zheng Fang</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+K">Ke Ye</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yaofang Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Gongzhe Li</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+X">Xianhong Zhao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jialong Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+R">Ruxin Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yuchen Zhang</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+X">Xiangyang Ji</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Q">Qilin Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.11680v1-abstract-short" style="display: inline;"> Point clouds or depth images captured by current RGB-D cameras often suffer from low resolution, rendering them insufficient for applications such as 3D reconstruction and robots. Existing point cloud super-resolution (PCSR) methods are either constrained by geometric artifacts or lack attention to edge details. To address these issues, we propose an edge-guided geometric-preserving 3D point cloud… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.11680v1-abstract-full').style.display = 'inline'; document.getElementById('2412.11680v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.11680v1-abstract-full" style="display: none;"> Point clouds or depth images captured by current RGB-D cameras often suffer from low resolution, rendering them insufficient for applications such as 3D reconstruction and robots. Existing point cloud super-resolution (PCSR) methods are either constrained by geometric artifacts or lack attention to edge details. To address these issues, we propose an edge-guided geometric-preserving 3D point cloud super-resolution (EGP3D) method tailored for RGB-D cameras. Our approach innovatively optimizes the point cloud with an edge constraint on a projected 2D space, thereby ensuring high-quality edge preservation in the 3D PCSR task. To tackle geometric optimization challenges in super-resolution point clouds, particularly preserving edge shapes and smoothness, we introduce a multi-faceted loss function that simultaneously optimizes the Chamfer distance, Hausdorff distance, and gradient smoothness. Existing datasets used for point cloud upsampling are predominantly synthetic and inadequately represent real-world scenarios, neglecting noise and stray light effects. To address the scarcity of realistic RGB-D data for PCSR tasks, we built a dataset that captures real-world noise and stray-light effects, offering a more accurate representation of authentic environments. Validated through simulations and real-world experiments, the proposed method exhibited superior performance in preserving edge clarity and geometric details. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.11680v1-abstract-full').style.display = 'none'; document.getElementById('2412.11680v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.11464">arXiv:2412.11464</a> <span> [<a href="https://arxiv.org/pdf/2412.11464">pdf</a>, <a href="https://arxiv.org/format/2412.11464">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MaskCLIP++: A Mask-Based CLIP Fine-tuning Framework for Open-Vocabulary Image Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zeng%2C+Q">Quan-Sheng Zeng</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yunheng Li</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+D">Daquan Zhou</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Guanbin Li</a>, <a href="/search/cs?searchtype=author&query=Hou%2C+Q">Qibin Hou</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+M">Ming-Ming Cheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.11464v1-abstract-short" style="display: inline;"> Open-vocabulary image segmentation has been advanced through the synergy between mask generators and vision-language models like Contrastive Language-Image Pre-training (CLIP). Previous approaches focus on generating masks while aligning mask features with text embeddings during training. In this paper, we observe that relying on generated low-quality masks can weaken the alignment of vision and l… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.11464v1-abstract-full').style.display = 'inline'; document.getElementById('2412.11464v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.11464v1-abstract-full" style="display: none;"> Open-vocabulary image segmentation has been advanced through the synergy between mask generators and vision-language models like Contrastive Language-Image Pre-training (CLIP). Previous approaches focus on generating masks while aligning mask features with text embeddings during training. In this paper, we observe that relying on generated low-quality masks can weaken the alignment of vision and language in regional representations. This motivates us to present a new fine-tuning framework, named MaskCLIP++, which uses ground-truth masks instead of generated masks to enhance the mask classification capability of CLIP. Due to the limited diversity of image segmentation datasets with mask annotations, we propose incorporating a consistency alignment constraint during fine-tuning, which alleviates categorical bias toward the fine-tuning dataset. After low-cost fine-tuning, combining with the mask generator in previous state-of-the-art mask-based open vocabulary segmentation methods, we achieve performance improvements of +1.7, +2.3, +2.1, +3.1, and +0.3 mIoU on the A-847, PC-459, A-150, PC-59, and PAS-20 datasets, respectively. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.11464v1-abstract-full').style.display = 'none'; document.getElementById('2412.11464v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">20 pages, 8 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.10483">arXiv:2412.10483</a> <span> [<a href="https://arxiv.org/pdf/2412.10483">pdf</a>, <a href="https://arxiv.org/format/2412.10483">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Programming Languages">cs.PL</span> </div> </div> <p class="title is-5 mathjax"> Enhancing Automated Loop Invariant Generation for Complex Programs with Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+R">Ruibang Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Guoqiang Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+M">Minyu Chen</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+L">Ling-I Wu</a>, <a href="/search/cs?searchtype=author&query=Ke%2C+J">Jingyu Ke</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.10483v1-abstract-short" style="display: inline;"> Automated program verification has always been an important component of building trustworthy software. While the analysis of real-world programs remains a theoretical challenge, the automation of loop invariant analysis has effectively resolved the problem. However, real-world programs that often mix complex data structures and control flows pose challenges to traditional loop invariant generatio… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.10483v1-abstract-full').style.display = 'inline'; document.getElementById('2412.10483v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.10483v1-abstract-full" style="display: none;"> Automated program verification has always been an important component of building trustworthy software. While the analysis of real-world programs remains a theoretical challenge, the automation of loop invariant analysis has effectively resolved the problem. However, real-world programs that often mix complex data structures and control flows pose challenges to traditional loop invariant generation tools. To enhance the applicability of invariant generation techniques, we proposed ACInv, an Automated Complex program loop Invariant generation tool, which combines static analysis with Large Language Models (LLMs) to generate the proper loop invariants. We utilize static analysis to extract the necessary information for each loop and embed it into prompts for the LLM to generate invariants for each loop. Subsequently, we employ an LLM-based evaluator to assess the generated invariants, refining them by either strengthening, weakening, or rejecting them based on their correctness, ultimately obtaining enhanced invariants. We conducted experiments on ACInv, which showed that ACInv outperformed previous tools on data sets with data structures, and maintained similar performance to the state-of-the-art tool AutoSpec on numerical programs without data structures. For the total data set, ACInv can solve 21% more examples than AutoSpec and can generate reference data structure templates. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.10483v1-abstract-full').style.display = 'none'; document.getElementById('2412.10483v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, 7 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.09362">arXiv:2412.09362</a> <span> [<a href="https://arxiv.org/pdf/2412.09362">pdf</a>, <a href="https://arxiv.org/format/2412.09362">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Falcon-UI: Understanding GUI Before Following User Instructions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shen%2C+H">Huawen Shen</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+C">Chang Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Gengluo Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xinlong Wang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yu Zhou</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+C">Can Ma</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+X">Xiangyang Ji</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.09362v1-abstract-short" style="display: inline;"> Pursuing human-like interaction for Graphical User Interface (GUI) agents requires understanding the GUI context and following user instructions. However, existing works typically couple these two aspects and focus more on instruct-following abilities, while ignoring the importance of understanding the GUI context. In this paper, we introduce an instruction-free GUI navigation dataset, termed Insi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.09362v1-abstract-full').style.display = 'inline'; document.getElementById('2412.09362v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.09362v1-abstract-full" style="display: none;"> Pursuing human-like interaction for Graphical User Interface (GUI) agents requires understanding the GUI context and following user instructions. However, existing works typically couple these two aspects and focus more on instruct-following abilities, while ignoring the importance of understanding the GUI context. In this paper, we introduce an instruction-free GUI navigation dataset, termed Insight-UI Dataset, to enhance model comprehension of GUI environments. Insight-UI Dataset is automatically generated from the Common Crawl corpus, simulating various platforms -- including iOS, Android, Windows, and Linux -- across multiple resolutions on 312K domains. Although GUI interactions vary by context, diverse interfaces share common internal patterns, such as clicking an item to view its details. It implies the feasibility of independent GUI operation learning, followed by joint optimization with instruction tuning. Thereby, we develop the GUI agent model Falcon-UI, which is initially pretrained on Insight-UI Dataset and subsequently fine-tuned on Android and Web GUI datasets, including AITW, AITZ, Android Control, and Mind2Web. With 7 billion parameters, Falcon-UI achieves accuracy comparable to the 72 billion-parameter Qwen2VL on AITZ, validating the alignment between GUI context comprehension and agent performance. Our code and dataset will be open-sourced. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.09362v1-abstract-full').style.display = 'none'; document.getElementById('2412.09362v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">18 pages, 14 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.09082">arXiv:2412.09082</a> <span> [<a href="https://arxiv.org/pdf/2412.09082">pdf</a>, <a href="https://arxiv.org/format/2412.09082">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Towards Long-Horizon Vision-Language Navigation: Platform, Benchmark and Method </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Song%2C+X">Xinshuai Song</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+W">Weixing Chen</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yang Liu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+W">Weikai Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Guanbin Li</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+L">Liang Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.09082v1-abstract-short" style="display: inline;"> Existing Vision-Language Navigation (VLN) methods primarily focus on single-stage navigation, limiting their effectiveness in multi-stage and long-horizon tasks within complex and dynamic environments. To address these limitations, we propose a novel VLN task, named Long-Horizon Vision-Language Navigation (LH-VLN), which emphasizes long-term planning and decision consistency across consecutive sub… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.09082v1-abstract-full').style.display = 'inline'; document.getElementById('2412.09082v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.09082v1-abstract-full" style="display: none;"> Existing Vision-Language Navigation (VLN) methods primarily focus on single-stage navigation, limiting their effectiveness in multi-stage and long-horizon tasks within complex and dynamic environments. To address these limitations, we propose a novel VLN task, named Long-Horizon Vision-Language Navigation (LH-VLN), which emphasizes long-term planning and decision consistency across consecutive subtasks. Furthermore, to support LH-VLN, we develop an automated data generation platform NavGen, which constructs datasets with complex task structures and improves data utility through a bidirectional, multi-granularity generation approach. To accurately evaluate complex tasks, we construct the Long-Horizon Planning and Reasoning in VLN (LHPR-VLN) benchmark consisting of 3,260 tasks with an average of 150 task steps, serving as the first dataset specifically designed for the long-horizon vision-language navigation task. Furthermore, we propose Independent Success Rate (ISR), Conditional Success Rate (CSR), and CSR weight by Ground Truth (CGT) metrics, to provide fine-grained assessments of task completion. To improve model adaptability in complex tasks, we propose a novel Multi-Granularity Dynamic Memory (MGDM) module that integrates short-term memory blurring with long-term memory retrieval to enable flexible navigation in dynamic environments. Our platform, benchmark and method supply LH-VLN with a robust data generation pipeline, comprehensive model evaluation dataset, reasonable metrics, and a novel VLN model, establishing a foundational framework for advancing LH-VLN. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.09082v1-abstract-full').style.display = 'none'; document.getElementById('2412.09082v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">A novel Vision-Language Navigation task: Long-Horizon Vision-Language Navigation</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.09050">arXiv:2412.09050</a> <span> [<a href="https://arxiv.org/pdf/2412.09050">pdf</a>, <a href="https://arxiv.org/format/2412.09050">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ContextHOI: Spatial Context Learning for Human-Object Interaction Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jia%2C+M">Mingda Jia</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+L">Liming Zhao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Ge Li</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Y">Yun Zheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.09050v1-abstract-short" style="display: inline;"> Spatial contexts, such as the backgrounds and surroundings, are considered critical in Human-Object Interaction (HOI) recognition, especially when the instance-centric foreground is blurred or occluded. Recent advancements in HOI detectors are usually built upon detection transformer pipelines. While such an object-detection-oriented paradigm shows promise in localizing objects, its exploration of… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.09050v1-abstract-full').style.display = 'inline'; document.getElementById('2412.09050v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.09050v1-abstract-full" style="display: none;"> Spatial contexts, such as the backgrounds and surroundings, are considered critical in Human-Object Interaction (HOI) recognition, especially when the instance-centric foreground is blurred or occluded. Recent advancements in HOI detectors are usually built upon detection transformer pipelines. While such an object-detection-oriented paradigm shows promise in localizing objects, its exploration of spatial context is often insufficient for accurately recognizing human actions. To enhance the capabilities of object detectors for HOI detection, we present a dual-branch framework named ContextHOI, which efficiently captures both object detection features and spatial contexts. In the context branch, we train the model to extract informative spatial context without requiring additional hand-craft background labels. Furthermore, we introduce context-aware spatial and semantic supervision to the context branch to filter out irrelevant noise and capture informative contexts. ContextHOI achieves state-of-the-art performance on the HICO-DET and v-coco benchmarks. For further validation, we construct a novel benchmark, HICO-ambiguous, which is a subset of HICO-DET that contains images with occluded or impaired instance cues. Extensive experiments across all benchmarks, complemented by visualizations, underscore the enhancements provided by ContextHOI, especially in recognizing interactions involving occluded or blurred instances. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.09050v1-abstract-full').style.display = 'none'; document.getElementById('2412.09050v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">in proceedings of the 39th AAAI Conference on Artificial Intelligence (AAAI-25)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.08907">arXiv:2412.08907</a> <span> [<a href="https://arxiv.org/pdf/2412.08907">pdf</a>, <a href="https://arxiv.org/format/2412.08907">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> GaGA: Towards Interactive Global Geolocation Assistant </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Dou%2C+Z">Zhiyang Dou</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zipeng Wang</a>, <a href="/search/cs?searchtype=author&query=Han%2C+X">Xumeng Han</a>, <a href="/search/cs?searchtype=author&query=Qiang%2C+C">Chenhui Qiang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+K">Kuiran Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Guorong Li</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Z">Zhibei Huang</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zhenjun Han</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.08907v1-abstract-short" style="display: inline;"> Global geolocation, which seeks to predict the geographical location of images captured anywhere in the world, is one of the most challenging tasks in the field of computer vision. In this paper, we introduce an innovative interactive global geolocation assistant named GaGA, built upon the flourishing large vision-language models (LVLMs). GaGA uncovers geographical clues within images and combines… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.08907v1-abstract-full').style.display = 'inline'; document.getElementById('2412.08907v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.08907v1-abstract-full" style="display: none;"> Global geolocation, which seeks to predict the geographical location of images captured anywhere in the world, is one of the most challenging tasks in the field of computer vision. In this paper, we introduce an innovative interactive global geolocation assistant named GaGA, built upon the flourishing large vision-language models (LVLMs). GaGA uncovers geographical clues within images and combines them with the extensive world knowledge embedded in LVLMs to determine the geolocations while also providing justifications and explanations for the prediction results. We further designed a novel interactive geolocation method that surpasses traditional static inference approaches. It allows users to intervene, correct, or provide clues for the predictions, making the model more flexible and practical. The development of GaGA relies on the newly proposed Multi-modal Global Geolocation (MG-Geo) dataset, a comprehensive collection of 5 million high-quality image-text pairs. GaGA achieves state-of-the-art performance on the GWS15k dataset, improving accuracy by 4.57% at the country level and 2.92% at the city level, setting a new benchmark. These advancements represent a significant leap forward in developing highly accurate, interactive geolocation systems with global applicability. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.08907v1-abstract-full').style.display = 'none'; document.getElementById('2412.08907v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.08604">arXiv:2412.08604</a> <span> [<a href="https://arxiv.org/pdf/2412.08604">pdf</a>, <a href="https://arxiv.org/format/2412.08604">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Preference Discerning with LLM-Enhanced Generative Retrieval </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Paischer%2C+F">Fabian Paischer</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+L">Liu Yang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Linfeng Liu</a>, <a href="/search/cs?searchtype=author&query=Shao%2C+S">Shuai Shao</a>, <a href="/search/cs?searchtype=author&query=Hassani%2C+K">Kaveh Hassani</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jiacheng Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+R">Ricky Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z+G">Zhang Gabriel Li</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+X">Xialo Gao</a>, <a href="/search/cs?searchtype=author&query=Shao%2C+W">Wei Shao</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+X">Xue Feng</a>, <a href="/search/cs?searchtype=author&query=Noorshams%2C+N">Nima Noorshams</a>, <a href="/search/cs?searchtype=author&query=Park%2C+S">Sem Park</a>, <a href="/search/cs?searchtype=author&query=Long%2C+B">Bo Long</a>, <a href="/search/cs?searchtype=author&query=Eghbalzadeh%2C+H">Hamid Eghbalzadeh</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.08604v1-abstract-short" style="display: inline;"> Sequential recommendation systems aim to provide personalized recommendations for users based on their interaction history. To achieve this, they often incorporate auxiliary information, such as textual descriptions of items and auxiliary tasks, like predicting user preferences and intent. Despite numerous efforts to enhance these models, they still suffer from limited personalization. To address… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.08604v1-abstract-full').style.display = 'inline'; document.getElementById('2412.08604v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.08604v1-abstract-full" style="display: none;"> Sequential recommendation systems aim to provide personalized recommendations for users based on their interaction history. To achieve this, they often incorporate auxiliary information, such as textual descriptions of items and auxiliary tasks, like predicting user preferences and intent. Despite numerous efforts to enhance these models, they still suffer from limited personalization. To address this issue, we propose a new paradigm, which we term preference discerning. In preference dscerning, we explicitly condition a generative sequential recommendation system on user preferences within its context. To this end, we generate user preferences using Large Language Models (LLMs) based on user reviews and item-specific data. To evaluate preference discerning capabilities of sequential recommendation systems, we introduce a novel benchmark that provides a holistic evaluation across various scenarios, including preference steering and sentiment following. We assess current state-of-the-art methods using our benchmark and show that they struggle to accurately discern user preferences. Therefore, we propose a new method named Mender ($\textbf{M}$ultimodal Prefer$\textbf{en}$ce $\textbf{d}$iscern$\textbf{er}$), which improves upon existing methods and achieves state-of-the-art performance on our benchmark. Our results show that Mender can be effectively guided by human preferences even though they have not been observed during training, paving the way toward more personalized sequential recommendation systems. We will open-source the code and benchmarks upon publication. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.08604v1-abstract-full').style.display = 'none'; document.getElementById('2412.08604v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages + references and appendix</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.08506">arXiv:2412.08506</a> <span> [<a href="https://arxiv.org/pdf/2412.08506">pdf</a>, <a href="https://arxiv.org/format/2412.08506">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Orchestrating the Symphony of Prompt Distribution Learning for Human-Object Interaction Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jia%2C+M">Mingda Jia</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+L">Liming Zhao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Ge Li</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Y">Yun Zheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.08506v1-abstract-short" style="display: inline;"> Human-object interaction (HOI) detectors with popular query-transformer architecture have achieved promising performance. However, accurately identifying uncommon visual patterns and distinguishing between ambiguous HOIs continue to be difficult for them. We observe that these difficulties may arise from the limited capacity of traditional detector queries in representing diverse intra-category pa… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.08506v1-abstract-full').style.display = 'inline'; document.getElementById('2412.08506v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.08506v1-abstract-full" style="display: none;"> Human-object interaction (HOI) detectors with popular query-transformer architecture have achieved promising performance. However, accurately identifying uncommon visual patterns and distinguishing between ambiguous HOIs continue to be difficult for them. We observe that these difficulties may arise from the limited capacity of traditional detector queries in representing diverse intra-category patterns and inter-category dependencies. To address this, we introduce the Interaction Prompt Distribution Learning (InterProDa) approach. InterProDa learns multiple sets of soft prompts and estimates category distributions from various prompts. It then incorporates HOI queries with category distributions, making them capable of representing near-infinite intra-category dynamics and universal cross-category relationships. Our InterProDa detector demonstrates competitive performance on HICO-DET and vcoco benchmarks. Additionally, our method can be integrated into most transformer-based HOI detectors, significantly enhancing their performance with minimal additional parameters. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.08506v1-abstract-full').style.display = 'none'; document.getElementById('2412.08506v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">in Proceedings of the 39th AAAI Conference on Artificial Intelligence (AAAI-25)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.07481">arXiv:2412.07481</a> <span> [<a href="https://arxiv.org/pdf/2412.07481">pdf</a>, <a href="https://arxiv.org/format/2412.07481">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Manta: Enhancing Mamba for Few-Shot Action Recognition of Long Sub-Sequence </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+W">Wenbo Huang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jinghui Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Guang Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Lei Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shuoyuan Wang</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+F">Fang Dong</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+J">Jiahui Jin</a>, <a href="/search/cs?searchtype=author&query=Ogawa%2C+T">Takahiro Ogawa</a>, <a href="/search/cs?searchtype=author&query=Haseyama%2C+M">Miki Haseyama</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.07481v1-abstract-short" style="display: inline;"> In few-shot action recognition~(FSAR), long sub-sequences of video naturally express entire actions more effectively. However, the computational complexity of mainstream Transformer-based methods limits their application. Recent Mamba demonstrates efficiency in modeling long sequences, but directly applying Mamba to FSAR overlooks the importance of local feature modeling and alignment. Moreover, l… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.07481v1-abstract-full').style.display = 'inline'; document.getElementById('2412.07481v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.07481v1-abstract-full" style="display: none;"> In few-shot action recognition~(FSAR), long sub-sequences of video naturally express entire actions more effectively. However, the computational complexity of mainstream Transformer-based methods limits their application. Recent Mamba demonstrates efficiency in modeling long sequences, but directly applying Mamba to FSAR overlooks the importance of local feature modeling and alignment. Moreover, long sub-sequences within the same class accumulate intra-class variance, which adversely impacts FSAR performance. To solve these challenges, we propose a \underline{\textbf{M}}atryoshka M\underline{\textbf{A}}mba and Co\underline{\textbf{N}}tras\underline{\textbf{T}}ive Le\underline{\textbf{A}}rning framework~(\textbf{Manta}). Firstly, the Matryoshka Mamba introduces multiple Inner Modules to enhance local feature representation, rather than directly modeling global features. An Outer Module captures dependencies of timeline between these local features for implicit temporal alignment. Secondly, a hybrid contrastive learning paradigm, combining both supervised and unsupervised methods, is designed to mitigate the negative effects of intra-class variance accumulation. The Matryoshka Mamba and the hybrid contrastive learning paradigm operate in parallel branches within Manta, enhancing Mamba for FSAR of long sub-sequence. Manta achieves new state-of-the-art performance on prominent benchmarks, including SSv2, Kinetics, UCF101, and HMDB51. Extensive empirical studies prove that Manta significantly improves FSAR of long sub-sequence from multiple perspectives. The code is released at https://github.com/wenbohuang1002/Manta. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.07481v1-abstract-full').style.display = 'none'; document.getElementById('2412.07481v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by AAAI 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.07360">arXiv:2412.07360</a> <span> [<a href="https://arxiv.org/pdf/2412.07360">pdf</a>, <a href="https://arxiv.org/format/2412.07360">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Efficient 3D Recognition with Event-driven Spike Sparse Convolution </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Qiu%2C+X">Xuerui Qiu</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+M">Man Yao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jieyuan Zhang</a>, <a href="/search/cs?searchtype=author&query=Chou%2C+Y">Yuhong Chou</a>, <a href="/search/cs?searchtype=author&query=Qiao%2C+N">Ning Qiao</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+S">Shibo Zhou</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+B">Bo Xu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Guoqi Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.07360v1-abstract-short" style="display: inline;"> Spiking Neural Networks (SNNs) provide an energy-efficient way to extract 3D spatio-temporal features. Point clouds are sparse 3D spatial data, which suggests that SNNs should be well-suited for processing them. However, when applying SNNs to point clouds, they often exhibit limited performance and fewer application scenarios. We attribute this to inappropriate preprocessing and feature extraction… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.07360v1-abstract-full').style.display = 'inline'; document.getElementById('2412.07360v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.07360v1-abstract-full" style="display: none;"> Spiking Neural Networks (SNNs) provide an energy-efficient way to extract 3D spatio-temporal features. Point clouds are sparse 3D spatial data, which suggests that SNNs should be well-suited for processing them. However, when applying SNNs to point clouds, they often exhibit limited performance and fewer application scenarios. We attribute this to inappropriate preprocessing and feature extraction methods. To address this issue, we first introduce the Spike Voxel Coding (SVC) scheme, which encodes the 3D point clouds into a sparse spike train space, reducing the storage requirements and saving time on point cloud preprocessing. Then, we propose a Spike Sparse Convolution (SSC) model for efficiently extracting 3D sparse point cloud features. Combining SVC and SSC, we design an efficient 3D SNN backbone (E-3DSNN), which is friendly with neuromorphic hardware. For instance, SSC can be implemented on neuromorphic chips with only minor modifications to the addressing function of vanilla spike convolution. Experiments on ModelNet40, KITTI, and Semantic KITTI datasets demonstrate that E-3DSNN achieves state-of-the-art (SOTA) results with remarkable efficiency. Notably, our E-3DSNN (1.87M) obtained 91.7\% top-1 accuracy on ModelNet40, surpassing the current best SNN baselines (14.3M) by 3.0\%. To our best knowledge, it is the first direct training 3D SNN backbone that can simultaneously handle various 3D computer vision tasks (e.g., classification, detection, and segmentation) with an event-driven nature. Code is available: https://github.com/bollossom/E-3DSNN/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.07360v1-abstract-full').style.display = 'none'; document.getElementById('2412.07360v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by AAAI 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.06666">arXiv:2412.06666</a> <span> [<a href="https://arxiv.org/pdf/2412.06666">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Medical Physics">physics.med-ph</span> </div> </div> <p class="title is-5 mathjax"> Diff5T: Benchmarking Human Brain Diffusion MRI with an Extensive 5.0 Tesla K-Space and Spatial Dataset </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shanshan Wang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+S">Shoujun Yu</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+J">Jian Cheng</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+S">Sen Jia</a>, <a href="/search/cs?searchtype=author&query=Tie%2C+C">Changjun Tie</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jiayu Zhu</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+H">Haohao Peng</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+Y">Yijing Dong</a>, <a href="/search/cs?searchtype=author&query=He%2C+J">Jianzhong He</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+F">Fan Zhang</a>, <a href="/search/cs?searchtype=author&query=Xing%2C+Y">Yaowen Xing</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+X">Xiuqin Jia</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Q">Qi Yang</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+Q">Qiyuan Tian</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+H">Hua Guo</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Guobin Li</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+H">Hairong Zheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.06666v1-abstract-short" style="display: inline;"> Diffusion magnetic resonance imaging (dMRI) provides critical insights into the microstructural and connectional organization of the human brain. However, the availability of high-field, open-access datasets that include raw k-space data for advanced research remains limited. To address this gap, we introduce Diff5T, a first comprehensive 5.0 Tesla diffusion MRI dataset focusing on the human brain… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.06666v1-abstract-full').style.display = 'inline'; document.getElementById('2412.06666v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.06666v1-abstract-full" style="display: none;"> Diffusion magnetic resonance imaging (dMRI) provides critical insights into the microstructural and connectional organization of the human brain. However, the availability of high-field, open-access datasets that include raw k-space data for advanced research remains limited. To address this gap, we introduce Diff5T, a first comprehensive 5.0 Tesla diffusion MRI dataset focusing on the human brain. This dataset includes raw k-space data and reconstructed diffusion images, acquired using a variety of imaging protocols. Diff5T is designed to support the development and benchmarking of innovative methods in artifact correction, image reconstruction, image preprocessing, diffusion modelling and tractography. The dataset features a wide range of diffusion parameters, including multiple b-values and gradient directions, allowing extensive research applications in studying human brain microstructure and connectivity. With its emphasis on open accessibility and detailed benchmarks, Diff5T serves as a valuable resource for advancing human brain mapping research using diffusion MRI, fostering reproducibility, and enabling collaboration across the neuroscience and medical imaging communities. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.06666v1-abstract-full').style.display = 'none'; document.getElementById('2412.06666v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">19 pages, 4 figures, 1 table</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.06355">arXiv:2412.06355</a> <span> [<a href="https://arxiv.org/pdf/2412.06355">pdf</a>, <a href="https://arxiv.org/format/2412.06355">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> </div> </div> <p class="title is-5 mathjax"> Flexible and Scalable Deep Dendritic Spiking Neural Networks with Multiple Nonlinear Branching </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+Y">Yifan Huang</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+W">Wei Fang</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+Z">Zhengyu Ma</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Guoqi Li</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+Y">Yonghong Tian</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.06355v1-abstract-short" style="display: inline;"> Recent advances in spiking neural networks (SNNs) have a predominant focus on network architectures, while relatively little attention has been paid to the underlying neuron model. The point neuron models, a cornerstone of deep SNNs, pose a bottleneck on the network-level expressivity since they depict somatic dynamics only. In contrast, the multi-compartment models in neuroscience offer remarkabl… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.06355v1-abstract-full').style.display = 'inline'; document.getElementById('2412.06355v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.06355v1-abstract-full" style="display: none;"> Recent advances in spiking neural networks (SNNs) have a predominant focus on network architectures, while relatively little attention has been paid to the underlying neuron model. The point neuron models, a cornerstone of deep SNNs, pose a bottleneck on the network-level expressivity since they depict somatic dynamics only. In contrast, the multi-compartment models in neuroscience offer remarkable expressivity by introducing dendritic morphology and dynamics, but remain underexplored in deep learning due to their unaffordable computational cost and inflexibility. To combine the advantages of both sides for a flexible, efficient yet more powerful model, we propose the dendritic spiking neuron (DendSN) incorporating multiple dendritic branches with nonlinear dynamics. Compared to the point spiking neurons, DendSN exhibits significantly higher expressivity. DendSN's flexibility enables its seamless integration into diverse deep SNN architectures. To accelerate dendritic SNNs (DendSNNs), we parallelize dendritic state updates across time steps, and develop Triton kernels for GPU-level acceleration. As a result, we can construct large-scale DendSNNs with depth comparable to their point SNN counterparts. Next, we comprehensively evaluate DendSNNs' performance on various demanding tasks. By modulating dendritic branch strengths using a context signal, catastrophic forgetting of DendSNNs is substantially mitigated. Moreover, DendSNNs demonstrate enhanced robustness against noise and adversarial attacks compared to point SNNs, and excel in few-shot learning settings. Our work firstly demonstrates the possibility of training bio-plausible dendritic SNNs with depths and scales comparable to traditional point SNNs, and reveals superior expressivity and robustness of reduced dendritic neuron models in deep learning, thereby offering a fresh perspective on advancing neural network design. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.06355v1-abstract-full').style.display = 'none'; document.getElementById('2412.06355v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.05647">arXiv:2412.05647</a> <span> [<a href="https://arxiv.org/pdf/2412.05647">pdf</a>, <a href="https://arxiv.org/ps/2412.05647">ps</a>, <a href="https://arxiv.org/format/2412.05647">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Deep Reinforcement Learning-Based Resource Allocation for Hybrid Bit and Generative Semantic Communications in Space-Air-Ground Integrated Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+C">Chong Huang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xuyang Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+G">Gaojie Chen</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+P">Pei Xiao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G+Y">Geoffrey Ye Li</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+W">Wei Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.05647v1-abstract-short" style="display: inline;"> In this paper, we introduce a novel framework consisting of hybrid bit-level and generative semantic communications for efficient downlink image transmission within space-air-ground integrated networks (SAGINs). The proposed model comprises multiple low Earth orbit (LEO) satellites, unmanned aerial vehicles (UAVs), and ground users. Considering the limitations in signal coverage and receiver anten… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.05647v1-abstract-full').style.display = 'inline'; document.getElementById('2412.05647v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.05647v1-abstract-full" style="display: none;"> In this paper, we introduce a novel framework consisting of hybrid bit-level and generative semantic communications for efficient downlink image transmission within space-air-ground integrated networks (SAGINs). The proposed model comprises multiple low Earth orbit (LEO) satellites, unmanned aerial vehicles (UAVs), and ground users. Considering the limitations in signal coverage and receiver antennas that make the direct communication between satellites and ground users unfeasible in many scenarios, thus UAVs serve as relays and forward images from satellites to the ground users. Our hybrid communication framework effectively combines bit-level transmission with several semantic-level image generation modes, optimizing bandwidth usage to meet stringent satellite link budget constraints and ensure communication reliability and low latency under low signal-to-noise ratio (SNR) conditions. To reduce the transmission delay while ensuring the reconstruction quality at the ground user, we propose a novel metric for measuring delay and reconstruction quality in the proposed system, and employ a deep reinforcement learning (DRL)-based strategy to optimize the resource in the proposed network. Simulation results demonstrate the superiority of the proposed framework in terms of communication resource conservation, reduced latency, and maintaining high image quality, significantly outperforming traditional solutions. Therefore, the proposed framework can ensure that real-time image transmission requirements in SAGINs, even under dynamic network conditions and user demand. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.05647v1-abstract-full').style.display = 'none'; document.getElementById('2412.05647v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.05302">arXiv:2412.05302</a> <span> [<a href="https://arxiv.org/pdf/2412.05302">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> A High Energy-Efficiency Multi-core Neuromorphic Architecture for Deep SNN Training </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+M">Mingjing Li</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+H">Huihui Zhou</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xiaofeng Xu</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+Z">Zhiwei Zhong</a>, <a href="/search/cs?searchtype=author&query=Quan%2C+P">Puli Quan</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+X">Xueke Zhu</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Y">Yanyu Lin</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+W">Wenjie Lin</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+H">Hongyu Guo</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Junchao Zhang</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+Y">Yunhao Ma</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wei Wang</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+Z">Zhengyu Ma</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Guoqi Li</a>, <a href="/search/cs?searchtype=author&query=Cui%2C+X">Xiaoxin Cui</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+Y">Yonghong Tian</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.05302v2-abstract-short" style="display: inline;"> There is a growing necessity for edge training to adapt to dynamically changing environment. Neuromorphic computing represents a significant pathway for high-efficiency intelligent computation in energy-constrained edges, but existing neuromorphic architectures lack the ability of directly training spiking neural networks (SNNs) based on backpropagation. We develop a multi-core neuromorphic archit… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.05302v2-abstract-full').style.display = 'inline'; document.getElementById('2412.05302v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.05302v2-abstract-full" style="display: none;"> There is a growing necessity for edge training to adapt to dynamically changing environment. Neuromorphic computing represents a significant pathway for high-efficiency intelligent computation in energy-constrained edges, but existing neuromorphic architectures lack the ability of directly training spiking neural networks (SNNs) based on backpropagation. We develop a multi-core neuromorphic architecture with Feedforward-Propagation, Back-Propagation, and Weight-Gradient engines in each core, supporting high efficient parallel computing at both the engine and core levels. It combines various data flows and sparse computation optimization by fully leveraging the sparsity in SNN training, obtaining a high energy efficiency of 1.05TFLOPS/W@ FP16 @ 28nm, 55 ~ 85% reduction of DRAM access compared to A100 GPU in SNN trainings, and a 20-core deep SNN training and a 5-worker federated learning on FPGAs. Our study develops the first multi-core neuromorphic architecture supporting the direct SNN training, facilitating the neuromorphic computing in edge-learnable applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.05302v2-abstract-full').style.display = 'none'; document.getElementById('2412.05302v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.04866">arXiv:2412.04866</a> <span> [<a href="https://arxiv.org/pdf/2412.04866">pdf</a>, <a href="https://arxiv.org/ps/2412.04866">ps</a>, <a href="https://arxiv.org/format/2412.04866">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Near-field Communications with Extremely Large-Scale Uniform Arc Arrays: Channel Modelling and Performance Analysis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+G">Guoyu Li</a>, <a href="/search/cs?searchtype=author&query=You%2C+C">Changsheng You</a>, <a href="/search/cs?searchtype=author&query=Shang%2C+G">Guanyu Shang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+S">Shaochuan Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.04866v1-abstract-short" style="display: inline;"> In this letter, we propose a new conformal array architecture, called extremely large-scale uniform arc array (XL-UAA), to improve near-field communication performance. Specifically,under the non-uniform spherical wavefront channel model, we establish mathematical modeling and performance analysis for XL-UAAs. It is shown that XL-UAAs have larger direction-dependent Rayleigh distance and uniform p… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.04866v1-abstract-full').style.display = 'inline'; document.getElementById('2412.04866v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.04866v1-abstract-full" style="display: none;"> In this letter, we propose a new conformal array architecture, called extremely large-scale uniform arc array (XL-UAA), to improve near-field communication performance. Specifically,under the non-uniform spherical wavefront channel model, we establish mathematical modeling and performance analysis for XL-UAAs. It is shown that XL-UAAs have larger direction-dependent Rayleigh distance and uniform power distance than the conventional XL uniform linear arrays (XL-ULAs). Moreover, a closed-form expression for the signal-to-noise ratio (SNR) is obtained, which depends on collective properties of XL-UAAs, such as the distance between the user and the array center,as well as the arc radius. In addition, we show that the asymptotic SNR of XL-UAAs with the number of antennas depends on the projection distance of the user to the middle of the arc array. Finally, numerical results verify that XL-UAAs achieve a higher SNR than XL-ULAs, especially at larger user incident angles. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.04866v1-abstract-full').style.display = 'none'; document.getElementById('2412.04866v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 3 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.03121">arXiv:2412.03121</a> <span> [<a href="https://arxiv.org/pdf/2412.03121">pdf</a>, <a href="https://arxiv.org/format/2412.03121">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Splats in Splats: Embedding Invisible 3D Watermark within Gaussian Splatting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guo%2C+Y">Yijia Guo</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+W">Wenkai Huang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yang Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Gaolei Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Hang Zhang</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+L">Liwen Hu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jianhua Li</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+T">Tiejun Huang</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+L">Lei Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.03121v1-abstract-short" style="display: inline;"> 3D Gaussian splatting (3DGS) has demonstrated impressive 3D reconstruction performance with explicit scene representations. Given the widespread application of 3DGS in 3D reconstruction and generation tasks, there is an urgent need to protect the copyright of 3DGS assets. However, existing copyright protection techniques for 3DGS overlook the usability of 3D assets, posing challenges for practical… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.03121v1-abstract-full').style.display = 'inline'; document.getElementById('2412.03121v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.03121v1-abstract-full" style="display: none;"> 3D Gaussian splatting (3DGS) has demonstrated impressive 3D reconstruction performance with explicit scene representations. Given the widespread application of 3DGS in 3D reconstruction and generation tasks, there is an urgent need to protect the copyright of 3DGS assets. However, existing copyright protection techniques for 3DGS overlook the usability of 3D assets, posing challenges for practical deployment. Here we describe WaterGS, the first 3DGS watermarking framework that embeds 3D content in 3DGS itself without modifying any attributes of the vanilla 3DGS. To achieve this, we take a deep insight into spherical harmonics (SH) and devise an importance-graded SH coefficient encryption strategy to embed the hidden SH coefficients. Furthermore, we employ a convolutional autoencoder to establish a mapping between the original Gaussian primitives' opacity and the hidden Gaussian primitives' opacity. Extensive experiments indicate that WaterGS significantly outperforms existing 3D steganography techniques, with 5.31% higher scene fidelity and 3X faster rendering speed, while ensuring security, robustness, and user experience. Codes and data will be released at https://water-gs.github.io. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.03121v1-abstract-full').style.display = 'none'; document.getElementById('2412.03121v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.02090">arXiv:2412.02090</a> <span> [<a href="https://arxiv.org/pdf/2412.02090">pdf</a>, <a href="https://arxiv.org/format/2412.02090">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Data Analysis, Statistics and Probability">physics.data-an</span> </div> </div> <p class="title is-5 mathjax"> MEP-Net: Generating Solutions to Scientific Problems with Limited Knowledge by Maximum Entropy Principle </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+W">Wuyue Yang</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+L">Liangrong Peng</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Guojie Li</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+L">Liu Hong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.02090v1-abstract-short" style="display: inline;"> Maximum entropy principle (MEP) offers an effective and unbiased approach to inferring unknown probability distributions when faced with incomplete information, while neural networks provide the flexibility to learn complex distributions from data. This paper proposes a novel neural network architecture, the MEP-Net, which combines the MEP with neural networks to generate probability distributions… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.02090v1-abstract-full').style.display = 'inline'; document.getElementById('2412.02090v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.02090v1-abstract-full" style="display: none;"> Maximum entropy principle (MEP) offers an effective and unbiased approach to inferring unknown probability distributions when faced with incomplete information, while neural networks provide the flexibility to learn complex distributions from data. This paper proposes a novel neural network architecture, the MEP-Net, which combines the MEP with neural networks to generate probability distributions from moment constraints. We also provide a comprehensive overview of the fundamentals of the maximum entropy principle, its mathematical formulations, and a rigorous justification for its applicability for non-equilibrium systems based on the large deviations principle. Through fruitful numerical experiments, we demonstrate that the MEP-Net can be particularly useful in modeling the evolution of probability distributions in biochemical reaction networks and in generating complex distributions from data. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.02090v1-abstract-full').style.display = 'none'; document.getElementById('2412.02090v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">35 pages, 6 figures, 2 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.01661">arXiv:2412.01661</a> <span> [<a href="https://arxiv.org/pdf/2412.01661">pdf</a>, <a href="https://arxiv.org/format/2412.01661">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Databases">cs.DB</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> R-Bot: An LLM-based Query Rewrite System </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sun%2C+Z">Zhaoyan Sun</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+X">Xuanhe Zhou</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Guoliang Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.01661v1-abstract-short" style="display: inline;"> Query rewrite is essential for optimizing SQL queries to improve their execution efficiency without changing their results. Traditionally, this task has been tackled through heuristic and learning-based methods, each with its limitations in terms of inferior quality and low robustness. Recent advancements in LLMs offer a new paradigm by leveraging their superior natural language and code comprehen… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.01661v1-abstract-full').style.display = 'inline'; document.getElementById('2412.01661v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.01661v1-abstract-full" style="display: none;"> Query rewrite is essential for optimizing SQL queries to improve their execution efficiency without changing their results. Traditionally, this task has been tackled through heuristic and learning-based methods, each with its limitations in terms of inferior quality and low robustness. Recent advancements in LLMs offer a new paradigm by leveraging their superior natural language and code comprehension abilities. Despite their potential, directly applying LLMs like GPT-4 has faced challenges due to problems such as hallucinations, where the model might generate inaccurate or irrelevant results. To address this, we propose R-Bot, an LLM-based query rewrite system with a systematic approach. We first design a multi-source rewrite evidence preparation pipeline to generate query rewrite evidences for guiding LLMs to avoid hallucinations. We then propose a hybrid structure-semantics retrieval method that combines structural and semantic analysis to retrieve the most relevant rewrite evidences for effectively answering an online query. We next propose a step-by-step LLM rewrite method that iteratively leverages the retrieved evidences to select and arrange rewrite rules with self-reflection. We conduct comprehensive experiments on widely used benchmarks, and demonstrate the superior performance of our system, R-Bot, surpassing state-of-the-art query rewrite methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.01661v1-abstract-full').style.display = 'none'; document.getElementById('2412.01661v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.01425">arXiv:2412.01425</a> <span> [<a href="https://arxiv.org/pdf/2412.01425">pdf</a>, <a href="https://arxiv.org/format/2412.01425">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Reject Threshold Adaptation for Open-Set Model Attribution of Deepfake Audio </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yan%2C+X">Xinrui Yan</a>, <a href="/search/cs?searchtype=author&query=Yi%2C+J">Jiangyan Yi</a>, <a href="/search/cs?searchtype=author&query=Tao%2C+J">Jianhua Tao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yujie Chen</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+H">Hao Gu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Guanjun Li</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Junzuo Zhou</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+Y">Yong Ren</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+T">Tao Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.01425v1-abstract-short" style="display: inline;"> Open environment oriented open set model attribution of deepfake audio is an emerging research topic, aiming to identify the generation models of deepfake audio. Most previous work requires manually setting a rejection threshold for unknown classes to compare with predicted probabilities. However, models often overfit training instances and generate overly confident predictions. Moreover, threshol… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.01425v1-abstract-full').style.display = 'inline'; document.getElementById('2412.01425v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.01425v1-abstract-full" style="display: none;"> Open environment oriented open set model attribution of deepfake audio is an emerging research topic, aiming to identify the generation models of deepfake audio. Most previous work requires manually setting a rejection threshold for unknown classes to compare with predicted probabilities. However, models often overfit training instances and generate overly confident predictions. Moreover, thresholds that effectively distinguish unknown categories in the current dataset may not be suitable for identifying known and unknown categories in another data distribution. To address the issues, we propose a novel framework for open set model attribution of deepfake audio with rejection threshold adaptation (ReTA). Specifically, the reconstruction error learning module trains by combining the representation of system fingerprints with labels corresponding to either the target class or a randomly chosen other class label. This process generates matching and non-matching reconstructed samples, establishing the reconstruction error distributions for each class and laying the foundation for the reject threshold calculation module. The reject threshold calculation module utilizes gaussian probability estimation to fit the distributions of matching and non-matching reconstruction errors. It then computes adaptive reject thresholds for all classes through probability minimization criteria. The experimental results demonstrate the effectiveness of ReTA in improving the open set model attributes of deepfake audio. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.01425v1-abstract-full').style.display = 'none'; document.getElementById('2412.01425v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ISCSLP 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.00082">arXiv:2412.00082</a> <span> [<a href="https://arxiv.org/pdf/2412.00082">pdf</a>, <a href="https://arxiv.org/format/2412.00082">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Dual Prototyping with Domain and Class Prototypes for Affective Brain-Computer Interface in Unseen Target Conditions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+G">Guangli Li</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Z">Zhehao Zhou</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+T">Tuo Sun</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+P">Ping Tan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Li Zhang</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+Z">Zhen Liang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.00082v1-abstract-short" style="display: inline;"> EEG signals have emerged as a powerful tool in affective brain-computer interfaces, playing a crucial role in emotion recognition. However, current deep transfer learning-based methods for EEG recognition face challenges due to the reliance of both source and target data in model learning, which significantly affect model performance and generalization. To overcome this limitation, we propose a no… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.00082v1-abstract-full').style.display = 'inline'; document.getElementById('2412.00082v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.00082v1-abstract-full" style="display: none;"> EEG signals have emerged as a powerful tool in affective brain-computer interfaces, playing a crucial role in emotion recognition. However, current deep transfer learning-based methods for EEG recognition face challenges due to the reliance of both source and target data in model learning, which significantly affect model performance and generalization. To overcome this limitation, we propose a novel framework (PL-DCP) and introduce the concepts of feature disentanglement and prototype inference. The dual prototyping mechanism incorporates both domain and class prototypes: domain prototypes capture individual variations across subjects, while class prototypes represent the ideal class distributions within their respective domains. Importantly, the proposed PL-DCP framework operates exclusively with source data during training, meaning that target data remains completely unseen throughout the entire process. To address label noise, we employ a pairwise learning strategy that encodes proximity relationships between sample pairs, effectively reducing the influence of mislabeled data. Experimental validation on the SEED and SEED-IV datasets demonstrates that PL-DCP, despite not utilizing target data during training, achieves performance comparable to deep transfer learning methods that require both source and target data. This highlights the potential of PL-DCP as an effective and robust approach for EEG-based emotion recognition. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.00082v1-abstract-full').style.display = 'none'; document.getElementById('2412.00082v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.00069">arXiv:2412.00069</a> <span> [<a href="https://arxiv.org/pdf/2412.00069">pdf</a>, <a href="https://arxiv.org/format/2412.00069">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Condense, Don't Just Prune: Enhancing Efficiency and Performance in MoE Layer Pruning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cao%2C+M">Mingyu Cao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Gen Li</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+J">Jie Ji</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiaqi Zhang</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+X">Xiaolong Ma</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+S">Shiwei Liu</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+L">Lu Yin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.00069v1-abstract-short" style="display: inline;"> Mixture-of-Experts (MOE) has garnered significant attention for their ability to scale up neural networks while utilizing the same or even fewer active parameters. However, MoE does not relieve the massive memory requirements of networks, which limits their practicality in real-world applications, especially in the era of large language models (LLMs). While recent work explores the possibility of… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.00069v1-abstract-full').style.display = 'inline'; document.getElementById('2412.00069v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.00069v1-abstract-full" style="display: none;"> Mixture-of-Experts (MOE) has garnered significant attention for their ability to scale up neural networks while utilizing the same or even fewer active parameters. However, MoE does not relieve the massive memory requirements of networks, which limits their practicality in real-world applications, especially in the era of large language models (LLMs). While recent work explores the possibility of removing entire layers of MoE to reduce memory, the performance degradation is still notable. In this paper, we propose Condense-MoE (CD-MoE} that, instead of dropping the entire MoE layer, condenses the big, sparse MoE layer into a small but dense layer with only a few experts that are activated for all tokens. Our approach is specifically designed for fine-grained MoE with shared experts, where Feed-Forward Networks are split into many small experts, with certain experts isolated to serve as shared experts that are always activated. We demonstrate the effectiveness of our method across multiple MoE models such as DeepSeekMoE and QwenMoE on various benchmarks. Specifically, for the DeepSeekMoE-16B model, our approach maintains nearly 90% of the average accuracy while reducing memory usage by 30% and enhancing inference speed by 30%. Moreover, we show that with lightweight expert fine-tuning, the pruned model can achieve further improvements on specific tasks. Our code are available at https://github.com/duterscmy/CD-MoE/tree/main. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.00069v1-abstract-full').style.display = 'none'; document.getElementById('2412.00069v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.19385">arXiv:2411.19385</a> <span> [<a href="https://arxiv.org/pdf/2411.19385">pdf</a>, <a href="https://arxiv.org/format/2411.19385">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Zero-Forget Preservation of Semantic Communication Alignment in Distributed AI Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hu%2C+J">Jingzhi Hu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G+Y">Geoffrey Ye Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.19385v1-abstract-short" style="display: inline;"> Future communication networks are expected to connect massive distributed artificial intelligence (AI). Exploiting aligned priori knowledge of AI pairs, it is promising to convert high-dimensional data transmission into highly-compressed semantic communications (SC). However, to accommodate the local data distribution and user preferences, AIs generally adapt to different domains, which fundamenta… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.19385v1-abstract-full').style.display = 'inline'; document.getElementById('2411.19385v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.19385v1-abstract-full" style="display: none;"> Future communication networks are expected to connect massive distributed artificial intelligence (AI). Exploiting aligned priori knowledge of AI pairs, it is promising to convert high-dimensional data transmission into highly-compressed semantic communications (SC). However, to accommodate the local data distribution and user preferences, AIs generally adapt to different domains, which fundamentally distorts the SC alignment. In this paper, we propose a zero-forget domain adaptation (ZFDA) framework to preserve SC alignment. To prevent the DA from changing substantial neural parameters of AI, we design sparse additive modifications (SAM) to the parameters, which can be efficiently stored and switched-off to restore the SC alignment. To optimize the SAM, we decouple it into tractable continuous variables and a binary mask, and then handle the binary mask by a score-based optimization. Experimental evaluations on a SC system for image transmissions validate that the proposed framework perfectly preserves the SC alignment with almost no loss of DA performance, even improved in some cases, at a cost of less than 1% of additional memory. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.19385v1-abstract-full').style.display = 'none'; document.getElementById('2411.19385v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.18814">arXiv:2411.18814</a> <span> [<a href="https://arxiv.org/pdf/2411.18814">pdf</a>, <a href="https://arxiv.org/format/2411.18814">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Unifying Generative and Dense Retrieval for Sequential Recommendation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+L">Liu Yang</a>, <a href="/search/cs?searchtype=author&query=Paischer%2C+F">Fabian Paischer</a>, <a href="/search/cs?searchtype=author&query=Hassani%2C+K">Kaveh Hassani</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jiacheng Li</a>, <a href="/search/cs?searchtype=author&query=Shao%2C+S">Shuai Shao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z+G">Zhang Gabriel Li</a>, <a href="/search/cs?searchtype=author&query=He%2C+Y">Yun He</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+X">Xue Feng</a>, <a href="/search/cs?searchtype=author&query=Noorshams%2C+N">Nima Noorshams</a>, <a href="/search/cs?searchtype=author&query=Park%2C+S">Sem Park</a>, <a href="/search/cs?searchtype=author&query=Long%2C+B">Bo Long</a>, <a href="/search/cs?searchtype=author&query=Nowak%2C+R+D">Robert D Nowak</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+X">Xiaoli Gao</a>, <a href="/search/cs?searchtype=author&query=Eghbalzadeh%2C+H">Hamid Eghbalzadeh</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.18814v2-abstract-short" style="display: inline;"> Sequential dense retrieval models utilize advanced sequence learning techniques to compute item and user representations, which are then used to rank relevant items for a user through inner product computation between the user and all item representations. However, this approach requires storing a unique representation for each item, resulting in significant memory requirements as the number of it… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18814v2-abstract-full').style.display = 'inline'; document.getElementById('2411.18814v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.18814v2-abstract-full" style="display: none;"> Sequential dense retrieval models utilize advanced sequence learning techniques to compute item and user representations, which are then used to rank relevant items for a user through inner product computation between the user and all item representations. However, this approach requires storing a unique representation for each item, resulting in significant memory requirements as the number of items grow. In contrast, the recently proposed generative retrieval paradigm offers a promising alternative by directly predicting item indices using a generative model trained on semantic IDs that encapsulate items' semantic information. Despite its potential for large-scale applications, a comprehensive comparison between generative retrieval and sequential dense retrieval under fair conditions is still lacking, leaving open questions regarding performance, and computation trade-offs. To address this, we compare these two approaches under controlled conditions on academic benchmarks and propose LIGER (LeveragIng dense retrieval for GEnerative Retrieval), a hybrid model that combines the strengths of these two widely used methods. LIGER integrates sequential dense retrieval into generative retrieval, mitigating performance differences and enhancing cold-start item recommendation in the datasets evaluated. This hybrid approach provides insights into the trade-offs between these approaches and demonstrates improvements in efficiency and effectiveness for recommendation systems in small-scale benchmarks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18814v2-abstract-full').style.display = 'none'; document.getElementById('2411.18814v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.17990">arXiv:2411.17990</a> <span> [<a href="https://arxiv.org/pdf/2411.17990">pdf</a>, <a href="https://arxiv.org/format/2411.17990">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Beam Switching Based Beam Design for High-Speed Train mmWave Communications </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+J">Jingjia Huang</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+C">Chenhao Qi</a>, <a href="/search/cs?searchtype=author&query=Dobre%2C+O+A">Octavia A. Dobre</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G+Y">Geoffrey Ye Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.17990v1-abstract-short" style="display: inline;"> For high-speed train (HST) millimeter wave (mmWave) communications, the use of narrow beams with small beam coverage needs frequent beam switching, while wider beams with small beam gain leads to weaker mmWave signal strength. In this paper, we consider beam switching based beam design, which is formulated as an optimization problem aiming to minimize the number of switched beams within a predeter… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17990v1-abstract-full').style.display = 'inline'; document.getElementById('2411.17990v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.17990v1-abstract-full" style="display: none;"> For high-speed train (HST) millimeter wave (mmWave) communications, the use of narrow beams with small beam coverage needs frequent beam switching, while wider beams with small beam gain leads to weaker mmWave signal strength. In this paper, we consider beam switching based beam design, which is formulated as an optimization problem aiming to minimize the number of switched beams within a predetermined railway range subject to that the receiving signal-to-noise ratio (RSNR) at the HST is no lower than a predetermined threshold. To solve this problem, we propose two sequential beam design schemes, both including two alternately-performed stages. In the first stage, given an updated beam coverage according to the railway range, we transform the problem into a feasibility problem and further convert it into a min-max optimization problem by relaxing the RSNR constraints into a penalty of the objective function. In the second stage, we evaluate the feasibility of the beamformer obtained from solving the min-max problem and determine the beam coverage accordingly. Simulation results show that compared to the first scheme, the second scheme can achieve 96.20\% reduction in computational complexity at the cost of only 0.0657\% performance degradation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17990v1-abstract-full').style.display = 'none'; document.getElementById('2411.17990v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.16579">arXiv:2411.16579</a> <span> [<a href="https://arxiv.org/pdf/2411.16579">pdf</a>, <a href="https://arxiv.org/format/2411.16579">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Enhancing LLM Reasoning via Critique Models with Test-Time and Training-Time Supervision </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xi%2C+Z">Zhiheng Xi</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+D">Dingwen Yang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+J">Jixuan Huang</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+J">Jiafu Tang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Guanyu Li</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+Y">Yiwen Ding</a>, <a href="/search/cs?searchtype=author&query=He%2C+W">Wei He</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+B">Boyang Hong</a>, <a href="/search/cs?searchtype=author&query=Do%2C+S">Shihan Do</a>, <a href="/search/cs?searchtype=author&query=Zhan%2C+W">Wenyu Zhan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiao Wang</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+R">Rui Zheng</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+T">Tao Ji</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+X">Xiaowei Shi</a>, <a href="/search/cs?searchtype=author&query=Zhai%2C+Y">Yitao Zhai</a>, <a href="/search/cs?searchtype=author&query=Weng%2C+R">Rongxiang Weng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jingang Wang</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+X">Xunliang Cai</a>, <a href="/search/cs?searchtype=author&query=Gui%2C+T">Tao Gui</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Z">Zuxuan Wu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Q">Qi Zhang</a>, <a href="/search/cs?searchtype=author&query=Qiu%2C+X">Xipeng Qiu</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+X">Xuanjing Huang</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+Y">Yu-Gang Jiang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.16579v1-abstract-short" style="display: inline;"> Training large language models (LLMs) to spend more time thinking and reflection before responding is crucial for effectively solving complex reasoning tasks in fields such as science, coding, and mathematics. However, the effectiveness of mechanisms like self-reflection and self-correction depends on the model's capacity to accurately assess its own performance, which can be limited by factors su… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16579v1-abstract-full').style.display = 'inline'; document.getElementById('2411.16579v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.16579v1-abstract-full" style="display: none;"> Training large language models (LLMs) to spend more time thinking and reflection before responding is crucial for effectively solving complex reasoning tasks in fields such as science, coding, and mathematics. However, the effectiveness of mechanisms like self-reflection and self-correction depends on the model's capacity to accurately assess its own performance, which can be limited by factors such as initial accuracy, question difficulty, and the lack of external feedback. In this paper, we delve into a two-player paradigm that separates the roles of reasoning and critique models, where the critique model provides step-level feedback to supervise the reasoning (actor) model during both test-time and train-time. We first propose AutoMathCritique, an automated and scalable framework for collecting critique data, resulting in a dataset of $76,321$ responses paired with step-level feedback. Fine-tuning language models with this dataset enables them to generate natural language feedback for mathematical reasoning. We demonstrate that the critique models consistently improve the actor's performance on difficult queries at test-time, especially when scaling up inference-time computation. Motivated by these findings, we introduce the critique-based supervision to the actor's self-training process, and propose a critique-in-the-loop self-improvement method. Experiments show that the method improves the actor's exploration efficiency and solution diversity, especially on challenging queries, leading to a stronger reasoning model. Lastly, we take the preliminary step to explore training self-talk reasoning models via critique supervision and showcase its potential. Our code and datasets are at \href{https://mathcritique.github.io/}{https://mathcritique.github.io/}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16579v1-abstract-full').style.display = 'none'; document.getElementById('2411.16579v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Preprint</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.16216">arXiv:2411.16216</a> <span> [<a href="https://arxiv.org/pdf/2411.16216">pdf</a>, <a href="https://arxiv.org/format/2411.16216">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> SMGDiff: Soccer Motion Generation using diffusion probabilistic models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+H">Hongdi Yang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Chengyang Li</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Z">Zhenxuan Wu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Gaozheng Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jingya Wang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+J">Jingyi Yu</a>, <a href="/search/cs?searchtype=author&query=Su%2C+Z">Zhuo Su</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+L">Lan Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.16216v1-abstract-short" style="display: inline;"> Soccer is a globally renowned sport with significant applications in video games and VR/AR. However, generating realistic soccer motions remains challenging due to the intricate interactions between the human player and the ball. In this paper, we introduce SMGDiff, a novel two-stage framework for generating real-time and user-controllable soccer motions. Our key idea is to integrate real-time cha… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16216v1-abstract-full').style.display = 'inline'; document.getElementById('2411.16216v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.16216v1-abstract-full" style="display: none;"> Soccer is a globally renowned sport with significant applications in video games and VR/AR. However, generating realistic soccer motions remains challenging due to the intricate interactions between the human player and the ball. In this paper, we introduce SMGDiff, a novel two-stage framework for generating real-time and user-controllable soccer motions. Our key idea is to integrate real-time character control with a powerful diffusion-based generative model, ensuring high-quality and diverse output motion. In the first stage, we instantly transform coarse user controls into diverse global trajectories of the character. In the second stage, we employ a transformer-based autoregressive diffusion model to generate soccer motions based on trajectory conditioning. We further incorporate a contact guidance module during inference to optimize the contact details for realistic ball-foot interactions. Moreover, we contribute a large-scale soccer motion dataset consisting of over 1.08 million frames of diverse soccer motions. Extensive experiments demonstrate that our SMGDiff significantly outperforms existing methods in terms of motion quality and condition alignment. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16216v1-abstract-full').style.display = 'none'; document.getElementById('2411.16216v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.16061">arXiv:2411.16061</a> <span> [<a href="https://arxiv.org/pdf/2411.16061">pdf</a>, <a href="https://arxiv.org/format/2411.16061">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Scaling Spike-driven Transformer with Efficient Spike Firing Approximation Training </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yao%2C+M">Man Yao</a>, <a href="/search/cs?searchtype=author&query=Qiu%2C+X">Xuerui Qiu</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+T">Tianxiang Hu</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+J">Jiakui Hu</a>, <a href="/search/cs?searchtype=author&query=Chou%2C+Y">Yuhong Chou</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+K">Keyu Tian</a>, <a href="/search/cs?searchtype=author&query=Liao%2C+J">Jianxing Liao</a>, <a href="/search/cs?searchtype=author&query=Leng%2C+L">Luziwei Leng</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+B">Bo Xu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Guoqi Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.16061v1-abstract-short" style="display: inline;"> The ambition of brain-inspired Spiking Neural Networks (SNNs) is to become a low-power alternative to traditional Artificial Neural Networks (ANNs). This work addresses two major challenges in realizing this vision: the performance gap between SNNs and ANNs, and the high training costs of SNNs. We identify intrinsic flaws in spiking neurons caused by binary firing mechanisms and propose a Spike Fi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16061v1-abstract-full').style.display = 'inline'; document.getElementById('2411.16061v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.16061v1-abstract-full" style="display: none;"> The ambition of brain-inspired Spiking Neural Networks (SNNs) is to become a low-power alternative to traditional Artificial Neural Networks (ANNs). This work addresses two major challenges in realizing this vision: the performance gap between SNNs and ANNs, and the high training costs of SNNs. We identify intrinsic flaws in spiking neurons caused by binary firing mechanisms and propose a Spike Firing Approximation (SFA) method using integer training and spike-driven inference. This optimizes the spike firing pattern of spiking neurons, enhancing efficient training, reducing power consumption, improving performance, enabling easier scaling, and better utilizing neuromorphic chips. We also develop an efficient spike-driven Transformer architecture and a spike-masked autoencoder to prevent performance degradation during SNN scaling. On ImageNet-1k, we achieve state-of-the-art top-1 accuracy of 78.5\%, 79.8\%, 84.0\%, and 86.2\% with models containing 10M, 19M, 83M, and 173M parameters, respectively. For instance, the 10M model outperforms the best existing SNN by 7.2\% on ImageNet, with training time acceleration and inference energy efficiency improved by 4.5$\times$ and 3.9$\times$, respectively. We validate the effectiveness and efficiency of the proposed method across various tasks, including object detection, semantic segmentation, and neuromorphic vision tasks. This work enables SNNs to match ANN performance while maintaining the low-power advantage, marking a significant step towards SNNs as a general visual backbone. Code is available at https://github.com/BICLab/Spike-Driven-Transformer-V3. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16061v1-abstract-full').style.display = 'none'; document.getElementById('2411.16061v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.15255">arXiv:2411.15255</a> <span> [<a href="https://arxiv.org/pdf/2411.15255">pdf</a>, <a href="https://arxiv.org/format/2411.15255">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> OSMamba: Omnidirectional Spectral Mamba with Dual-Domain Prior Generator for Exposure Correction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+G">Gehui Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+B">Bin Chen</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+C">Chen Zhao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Lei Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jian Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.15255v1-abstract-short" style="display: inline;"> Exposure correction is a fundamental problem in computer vision and image processing. Recently, frequency domain-based methods have achieved impressive improvement, yet they still struggle with complex real-world scenarios under extreme exposure conditions. This is due to the local convolutional receptive fields failing to model long-range dependencies in the spectrum, and the non-generative learn… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15255v1-abstract-full').style.display = 'inline'; document.getElementById('2411.15255v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.15255v1-abstract-full" style="display: none;"> Exposure correction is a fundamental problem in computer vision and image processing. Recently, frequency domain-based methods have achieved impressive improvement, yet they still struggle with complex real-world scenarios under extreme exposure conditions. This is due to the local convolutional receptive fields failing to model long-range dependencies in the spectrum, and the non-generative learning paradigm being inadequate for retrieving lost details from severely degraded regions. In this paper, we propose Omnidirectional Spectral Mamba (OSMamba), a novel exposure correction network that incorporates the advantages of state space models and generative diffusion models to address these limitations. Specifically, OSMamba introduces an omnidirectional spectral scanning mechanism that adapts Mamba to the frequency domain to capture comprehensive long-range dependencies in both the amplitude and phase spectra of deep image features, hence enhancing illumination correction and structure recovery. Furthermore, we develop a dual-domain prior generator that learns from well-exposed images to generate a degradation-free diffusion prior containing correct information about severely under- and over-exposed regions for better detail restoration. Extensive experiments on multiple-exposure and mixed-exposure datasets demonstrate that the proposed OSMamba achieves state-of-the-art performance both quantitatively and qualitatively. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15255v1-abstract-full').style.display = 'none'; document.getElementById('2411.15255v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.15205">arXiv:2411.15205</a> <span> [<a href="https://arxiv.org/pdf/2411.15205">pdf</a>, <a href="https://arxiv.org/format/2411.15205">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> DAGSM: Disentangled Avatar Generation with GS-enhanced Mesh </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhuang%2C+J">Jingyu Zhuang</a>, <a href="/search/cs?searchtype=author&query=Kang%2C+D">Di Kang</a>, <a href="/search/cs?searchtype=author&query=Bao%2C+L">Linchao Bao</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+L">Liang Lin</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Guanbin Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.15205v1-abstract-short" style="display: inline;"> Text-driven avatar generation has gained significant attention owing to its convenience. However, existing methods typically model the human body with all garments as a single 3D model, limiting its usability, such as clothing replacement, and reducing user control over the generation process. To overcome the limitations above, we propose DAGSM, a novel pipeline that generates disentangled human b… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15205v1-abstract-full').style.display = 'inline'; document.getElementById('2411.15205v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.15205v1-abstract-full" style="display: none;"> Text-driven avatar generation has gained significant attention owing to its convenience. However, existing methods typically model the human body with all garments as a single 3D model, limiting its usability, such as clothing replacement, and reducing user control over the generation process. To overcome the limitations above, we propose DAGSM, a novel pipeline that generates disentangled human bodies and garments from the given text prompts. Specifically, we model each part (e.g., body, upper/lower clothes) of the clothed human as one GS-enhanced mesh (GSM), which is a traditional mesh attached with 2D Gaussians to better handle complicated textures (e.g., woolen, translucent clothes) and produce realistic cloth animations. During the generation, we first create the unclothed body, followed by a sequence of individual cloth generation based on the body, where we introduce a semantic-based algorithm to achieve better human-cloth and garment-garment separation. To improve texture quality, we propose a view-consistent texture refinement module, including a cross-view attention mechanism for texture style consistency and an incident-angle-weighted denoising (IAW-DE) strategy to update the appearance. Extensive experiments have demonstrated that DAGSM generates high-quality disentangled avatars, supports clothing replacement and realistic animation, and outperforms the baselines in visual quality. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15205v1-abstract-full').style.display = 'none'; document.getElementById('2411.15205v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14738">arXiv:2411.14738</a> <span> [<a href="https://arxiv.org/pdf/2411.14738">pdf</a>, <a href="https://arxiv.org/format/2411.14738">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Universal and Context-Independent Triggers for Precise Control of LLM Outputs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liang%2C+J">Jiashuo Liang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Guancheng Li</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+Y">Yang Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14738v1-abstract-short" style="display: inline;"> Large language models (LLMs) have been widely adopted in applications such as automated content generation and even critical decision-making systems. However, the risk of prompt injection allows for potential manipulation of LLM outputs. While numerous attack methods have been documented, achieving full control over these outputs remains challenging, often requiring experienced attackers to make m… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14738v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14738v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14738v1-abstract-full" style="display: none;"> Large language models (LLMs) have been widely adopted in applications such as automated content generation and even critical decision-making systems. However, the risk of prompt injection allows for potential manipulation of LLM outputs. While numerous attack methods have been documented, achieving full control over these outputs remains challenging, often requiring experienced attackers to make multiple attempts and depending heavily on the prompt context. Recent advancements in gradient-based white-box attack techniques have shown promise in tasks like jailbreaks and system prompt leaks. Our research generalizes gradient-based attacks to find a trigger that is (1) Universal: effective irrespective of the target output; (2) Context-Independent: robust across diverse prompt contexts; and (3) Precise Output: capable of manipulating LLM inputs to yield any specified output with high accuracy. We propose a novel method to efficiently discover such triggers and assess the effectiveness of the proposed attack. Furthermore, we discuss the substantial threats posed by such attacks to LLM-based applications, highlighting the potential for adversaries to taking over the decisions and actions made by AI agents. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14738v1-abstract-full').style.display = 'none'; document.getElementById('2411.14738v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13876">arXiv:2411.13876</a> <span> [<a href="https://arxiv.org/pdf/2411.13876">pdf</a>, <a href="https://arxiv.org/format/2411.13876">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> </div> </div> <p class="title is-5 mathjax"> Iterative decoding of short BCH codes and its post-processing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+G">Guangwen Li</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+X">Xiao Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13876v1-abstract-short" style="display: inline;"> Effective iterative decoding of short BCH codes faces two primary challenges: identifying an appropriate parity-check matrix and accelerating decoder convergence. To address these issues, we propose a systematic scheme to derive an optimized parity-check matrix through a heuristic approach. This involves a series of binary sum and row shift operations, resulting in a low-density, quasi-regular col… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13876v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13876v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13876v1-abstract-full" style="display: none;"> Effective iterative decoding of short BCH codes faces two primary challenges: identifying an appropriate parity-check matrix and accelerating decoder convergence. To address these issues, we propose a systematic scheme to derive an optimized parity-check matrix through a heuristic approach. This involves a series of binary sum and row shift operations, resulting in a low-density, quasi-regular column weight distribution with a reduced number of shortest cycles in the underlying redundant Tanner graph. For the revised normalized min-sum decoder, we concurrently integrate three types of random permutations into the alternated messages across iterations, leading to significantly faster convergence compared to existing methods. Furthermore, by utilizing the iterative trajectories of failed normalized min-sum decoding, we enhance the reliability measurement of codeword bits with the assistance of a neural network model from prior work, which accommodates more failures for the post-processing of ordered statistics decoding. Additionally, we report the types of undetected errors for the design of iterative decoders for short BCH codes, which potentially challenge efforts to approach the maximum likelihood limit. Extensive simulations demonstrate that the proposed hybrid framework achieves an attractive balance between performance, latency, and complexity. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13876v1-abstract-full').style.display = 'none'; document.getElementById('2411.13876v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">6 pages, 6 figures, 2 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13626">arXiv:2411.13626</a> <span> [<a href="https://arxiv.org/pdf/2411.13626">pdf</a>, <a href="https://arxiv.org/format/2411.13626">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Principles of Visual Tokens for Efficient Video Understanding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hao%2C+X">Xinyue Hao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Gen Li</a>, <a href="/search/cs?searchtype=author&query=Gowda%2C+S+N">Shreyank N Gowda</a>, <a href="/search/cs?searchtype=author&query=Fisher%2C+R+B">Robert B Fisher</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+J">Jonathan Huang</a>, <a href="/search/cs?searchtype=author&query=Arnab%2C+A">Anurag Arnab</a>, <a href="/search/cs?searchtype=author&query=Sevilla-Lara%2C+L">Laura Sevilla-Lara</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13626v1-abstract-short" style="display: inline;"> Video understanding has made huge strides in recent years, relying largely on the power of the transformer architecture. As this architecture is notoriously expensive and video is highly redundant, research into improving efficiency has become particularly relevant. This has led to many creative solutions, including token merging and token selection. While most methods succeed in reducing the cost… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13626v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13626v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13626v1-abstract-full" style="display: none;"> Video understanding has made huge strides in recent years, relying largely on the power of the transformer architecture. As this architecture is notoriously expensive and video is highly redundant, research into improving efficiency has become particularly relevant. This has led to many creative solutions, including token merging and token selection. While most methods succeed in reducing the cost of the model and maintaining accuracy, an interesting pattern arises: most methods do not outperform the random sampling baseline. In this paper we take a closer look at this phenomenon and make several observations. First, we develop an oracle for the value of tokens which exposes a clear Pareto distribution where most tokens have remarkably low value, and just a few carry most of the perceptual information. Second, we analyze why this oracle is extremely hard to learn, as it does not consistently coincide with visual cues. Third, we observe that easy videos need fewer tokens to maintain accuracy. We build on these and further insights to propose a lightweight video model we call LITE that can select a small number of tokens effectively, outperforming state-of-the-art and existing baselines across datasets (Kinetics400 and Something-Something-V2) in the challenging trade-off of computation (GFLOPs) vs accuracy. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13626v1-abstract-full').style.display = 'none'; document.getElementById('2411.13626v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13383">arXiv:2411.13383</a> <span> [<a href="https://arxiv.org/pdf/2411.13383">pdf</a>, <a href="https://arxiv.org/format/2411.13383">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Adversarial Diffusion Compression for Real-World Image Super-Resolution </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+B">Bin Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Gehui Li</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+R">Rongyuan Wu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xindong Zhang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jie Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jian Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Lei Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13383v1-abstract-short" style="display: inline;"> Real-world image super-resolution (Real-ISR) aims to reconstruct high-resolution images from low-resolution inputs degraded by complex, unknown processes. While many Stable Diffusion (SD)-based Real-ISR methods have achieved remarkable success, their slow, multi-step inference hinders practical deployment. Recent SD-based one-step networks like OSEDiff and S3Diff alleviate this issue but still inc… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13383v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13383v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13383v1-abstract-full" style="display: none;"> Real-world image super-resolution (Real-ISR) aims to reconstruct high-resolution images from low-resolution inputs degraded by complex, unknown processes. While many Stable Diffusion (SD)-based Real-ISR methods have achieved remarkable success, their slow, multi-step inference hinders practical deployment. Recent SD-based one-step networks like OSEDiff and S3Diff alleviate this issue but still incur high computational costs due to their reliance on large pretrained SD models. This paper proposes a novel Real-ISR method, AdcSR, by distilling the one-step diffusion network OSEDiff into a streamlined diffusion-GAN model under our Adversarial Diffusion Compression (ADC) framework. We meticulously examine the modules of OSEDiff, categorizing them into two types: (1) Removable (VAE encoder, prompt extractor, text encoder, etc.) and (2) Prunable (denoising UNet and VAE decoder). Since direct removal and pruning can degrade the model's generation capability, we pretrain our pruned VAE decoder to restore its ability to decode images and employ adversarial distillation to compensate for performance loss. This ADC-based diffusion-GAN hybrid design effectively reduces complexity by 73% in inference time, 78% in computation, and 74% in parameters, while preserving the model's generation capability. Experiments manifest that our proposed AdcSR achieves competitive recovery quality on both synthetic and real-world datasets, offering up to 9.3$\times$ speedup over previous one-step diffusion-based methods. Code and models will be made available. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13383v1-abstract-full').style.display = 'none'; document.getElementById('2411.13383v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13183">arXiv:2411.13183</a> <span> [<a href="https://arxiv.org/pdf/2411.13183">pdf</a>, <a href="https://arxiv.org/format/2411.13183">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ClickTrack: Towards Real-time Interactive Single Object Tracking </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+K">Kuiran Wang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+X">Xuehui Yu</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+W">Wenwen Yu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Guorong Li</a>, <a href="/search/cs?searchtype=author&query=Lan%2C+X">Xiangyuan Lan</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+Q">Qixiang Ye</a>, <a href="/search/cs?searchtype=author&query=Jiao%2C+J">Jianbin Jiao</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zhenjun Han</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13183v2-abstract-short" style="display: inline;"> Single object tracking(SOT) relies on precise object bounding box initialization. In this paper, we reconsidered the deficiencies in the current approaches to initializing single object trackers and propose a new paradigm for single object tracking algorithms, ClickTrack, a new paradigm using clicking interaction for real-time scenarios. Moreover, click as an input type inherently lack hierarchica… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13183v2-abstract-full').style.display = 'inline'; document.getElementById('2411.13183v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13183v2-abstract-full" style="display: none;"> Single object tracking(SOT) relies on precise object bounding box initialization. In this paper, we reconsidered the deficiencies in the current approaches to initializing single object trackers and propose a new paradigm for single object tracking algorithms, ClickTrack, a new paradigm using clicking interaction for real-time scenarios. Moreover, click as an input type inherently lack hierarchical information. To address ambiguity in certain special scenarios, we designed the Guided Click Refiner(GCR), which accepts point and optional textual information as inputs, transforming the point into the bounding box expected by the operator. The bounding box will be used as input of single object trackers. Experiments on LaSOT and GOT-10k benchmarks show that tracker combined with GCR achieves stable performance in real-time interactive scenarios. Furthermore, we explored the integration of GCR into the Segment Anything model(SAM), significantly reducing ambiguity issues when SAM receives point inputs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13183v2-abstract-full').style.display = 'none'; document.getElementById('2411.13183v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13093">arXiv:2411.13093</a> <span> [<a href="https://arxiv.org/pdf/2411.13093">pdf</a>, <a href="https://arxiv.org/format/2411.13093">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Video-RAG: Visually-aligned Retrieval-Augmented Long Video Comprehension </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Luo%2C+Y">Yongdong Luo</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+X">Xiawu Zheng</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xiao Yang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Guilin Li</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+H">Haojia Lin</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+J">Jinfa Huang</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+J">Jiayi Ji</a>, <a href="/search/cs?searchtype=author&query=Chao%2C+F">Fei Chao</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+J">Jiebo Luo</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+R">Rongrong Ji</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13093v1-abstract-short" style="display: inline;"> Existing large video-language models (LVLMs) struggle to comprehend long videos correctly due to limited context. To address this problem, fine-tuning long-context LVLMs and employing GPT-based agents have emerged as promising solutions. However, fine-tuning LVLMs would require extensive high-quality data and substantial GPU resources, while GPT-based agents would rely on proprietary models (e.g.,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13093v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13093v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13093v1-abstract-full" style="display: none;"> Existing large video-language models (LVLMs) struggle to comprehend long videos correctly due to limited context. To address this problem, fine-tuning long-context LVLMs and employing GPT-based agents have emerged as promising solutions. However, fine-tuning LVLMs would require extensive high-quality data and substantial GPU resources, while GPT-based agents would rely on proprietary models (e.g., GPT-4o). In this paper, we propose Video Retrieval-Augmented Generation (Video-RAG), a training-free and cost-effective pipeline that employs visually-aligned auxiliary texts to help facilitate cross-modality alignment while providing additional information beyond the visual content. Specifically, we leverage open-source external tools to extract visually-aligned information from pure video data (e.g., audio, optical character, and object detection), and incorporate the extracted information into an existing LVLM as auxiliary texts, alongside video frames and queries, in a plug-and-play manner. Our Video-RAG offers several key advantages: (i) lightweight with low computing overhead due to single-turn retrieval; (ii) easy implementation and compatibility with any LVLM; and (iii) significant, consistent performance gains across long video understanding benchmarks, including Video-MME, MLVU, and LongVideoBench. Notably, our model demonstrates superior performance over proprietary models like Gemini-1.5-Pro and GPT-4o when utilized with a 72B model. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13093v1-abstract-full').style.display = 'none'; document.getElementById('2411.13093v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 6 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12981">arXiv:2411.12981</a> <span> [<a href="https://arxiv.org/pdf/2411.12981">pdf</a>, <a href="https://arxiv.org/format/2411.12981">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> GazeGaussian: High-Fidelity Gaze Redirection with 3D Gaussian Splatting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wei%2C+X">Xiaobao Wei</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+P">Peng Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Guangyu Li</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+M">Ming Lu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hui Chen</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+F">Feng Tian</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12981v1-abstract-short" style="display: inline;"> Gaze estimation encounters generalization challenges when dealing with out-of-distribution data. To address this problem, recent methods use neural radiance fields (NeRF) to generate augmented data. However, existing methods based on NeRF are computationally expensive and lack facial details. 3D Gaussian Splatting (3DGS) has become the prevailing representation of neural fields. While 3DGS has bee… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12981v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12981v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12981v1-abstract-full" style="display: none;"> Gaze estimation encounters generalization challenges when dealing with out-of-distribution data. To address this problem, recent methods use neural radiance fields (NeRF) to generate augmented data. However, existing methods based on NeRF are computationally expensive and lack facial details. 3D Gaussian Splatting (3DGS) has become the prevailing representation of neural fields. While 3DGS has been extensively examined in head avatars, it faces challenges with accurate gaze control and generalization across different subjects. In this work, we propose GazeGaussian, a high-fidelity gaze redirection method that uses a two-stream 3DGS model to represent the face and eye regions separately. By leveraging the unstructured nature of 3DGS, we develop a novel eye representation for rigid eye rotation based on the target gaze direction. To enhance synthesis generalization across various subjects, we integrate an expression-conditional module to guide the neural renderer. Comprehensive experiments show that GazeGaussian outperforms existing methods in rendering speed, gaze redirection accuracy, and facial synthesis across multiple datasets. We also demonstrate that existing gaze estimation methods can leverage GazeGaussian to improve their generalization performance. The code will be available at: https://ucwxb.github.io/GazeGaussian/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12981v1-abstract-full').style.display = 'none'; document.getElementById('2411.12981v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11982">arXiv:2411.11982</a> <span> [<a href="https://arxiv.org/pdf/2411.11982">pdf</a>, <a href="https://arxiv.org/format/2411.11982">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> HPA-MPC: Hybrid Perception-Aware Nonlinear Model Predictive Control for Quadrotors with Suspended Loads </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sarvaiya%2C+M">Mrunal Sarvaiya</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Guanrui Li</a>, <a href="/search/cs?searchtype=author&query=Loianno%2C+G">Giuseppe Loianno</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11982v1-abstract-short" style="display: inline;"> Quadrotors equipped with cable-suspended loads represent a versatile, low-cost, and energy efficient solution for aerial transportation, construction, and manipulation tasks. However, their real-world deployment is hindered by several challenges. The system is difficult to control because it is nonlinear, underactuated, involves hybrid dynamics due to slack-taut cable modes, and evolves on complex… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11982v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11982v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11982v1-abstract-full" style="display: none;"> Quadrotors equipped with cable-suspended loads represent a versatile, low-cost, and energy efficient solution for aerial transportation, construction, and manipulation tasks. However, their real-world deployment is hindered by several challenges. The system is difficult to control because it is nonlinear, underactuated, involves hybrid dynamics due to slack-taut cable modes, and evolves on complex configuration spaces. Additionally, it is crucial to estimate the full state and the cable's mode transitions in real-time using on-board sensors and computation. To address these challenges, we present a novel Hybrid Perception-Aware Nonlinear Model Predictive Control (HPA-MPC) control approach for quadrotors with suspended loads. Our method considers the complete hybrid system dynamics and includes a perception-aware cost to ensure the payload remains visible in the robot's camera during navigation. Furthermore, the full state and hybrid dynamics' transitions are estimated using onboard sensors. Experimental results demonstrate that our approach enables stable load tracking control, even during slack-taut transitions, and operates entirely onboard. The experiments also show that the perception-aware term effectively keeps the payload in the robot's camera field of view when a human operator interacts with the load. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11982v1-abstract-full').style.display = 'none'; document.getElementById('2411.11982v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to IEEE Robotics and Automation Letters</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11581">arXiv:2411.11581</a> <span> [<a href="https://arxiv.org/pdf/2411.11581">pdf</a>, <a href="https://arxiv.org/format/2411.11581">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> OASIS: Open Agent Social Interaction Simulations with One Million Agents </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Ziyi Yang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zaibin Zhang</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Z">Zirui Zheng</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+Y">Yuxian Jiang</a>, <a href="/search/cs?searchtype=author&query=Gan%2C+Z">Ziyue Gan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhiyu Wang</a>, <a href="/search/cs?searchtype=author&query=Ling%2C+Z">Zijian Ling</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jinsong Chen</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+M">Martz Ma</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+B">Bowen Dong</a>, <a href="/search/cs?searchtype=author&query=Gupta%2C+P">Prateek Gupta</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+S">Shuyue Hu</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+Z">Zhenfei Yin</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Guohao Li</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+X">Xu Jia</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L">Lijun Wang</a>, <a href="/search/cs?searchtype=author&query=Ghanem%2C+B">Bernard Ghanem</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+H">Huchuan Lu</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+C">Chaochao Lu</a>, <a href="/search/cs?searchtype=author&query=Ouyang%2C+W">Wanli Ouyang</a>, <a href="/search/cs?searchtype=author&query=Qiao%2C+Y">Yu Qiao</a>, <a href="/search/cs?searchtype=author&query=Torr%2C+P">Philip Torr</a>, <a href="/search/cs?searchtype=author&query=Shao%2C+J">Jing Shao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11581v4-abstract-short" style="display: inline;"> There has been a growing interest in enhancing rule-based agent-based models (ABMs) for social media platforms (i.e., X, Reddit) with more realistic large language model (LLM) agents, thereby allowing for a more nuanced study of complex systems. As a result, several LLM-based ABMs have been proposed in the past year. While they hold promise, each simulator is specifically designed to study a parti… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11581v4-abstract-full').style.display = 'inline'; document.getElementById('2411.11581v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11581v4-abstract-full" style="display: none;"> There has been a growing interest in enhancing rule-based agent-based models (ABMs) for social media platforms (i.e., X, Reddit) with more realistic large language model (LLM) agents, thereby allowing for a more nuanced study of complex systems. As a result, several LLM-based ABMs have been proposed in the past year. While they hold promise, each simulator is specifically designed to study a particular scenario, making it time-consuming and resource-intensive to explore other phenomena using the same ABM. Additionally, these models simulate only a limited number of agents, whereas real-world social media platforms involve millions of users. To this end, we propose OASIS, a generalizable and scalable social media simulator. OASIS is designed based on real-world social media platforms, incorporating dynamically updated environments (i.e., dynamic social networks and post information), diverse action spaces (i.e., following, commenting), and recommendation systems (i.e., interest-based and hot-score-based). Additionally, OASIS supports large-scale user simulations, capable of modeling up to one million users. With these features, OASIS can be easily extended to different social media platforms to study large-scale group phenomena and behaviors. We replicate various social phenomena, including information spreading, group polarization, and herd effects across X and Reddit platforms. Moreover, we provide observations of social phenomena at different agent group scales. We observe that the larger agent group scale leads to more enhanced group dynamics and more diverse and helpful agents' opinions. These findings demonstrate OASIS's potential as a powerful tool for studying complex systems in digital environments. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11581v4-abstract-full').style.display = 'none'; document.getElementById('2411.11581v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10831">arXiv:2411.10831</a> <span> [<a href="https://arxiv.org/pdf/2411.10831">pdf</a>, <a href="https://arxiv.org/format/2411.10831">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Neighboring Slice Noise2Noise: Self-Supervised Medical Image Denoising from Single Noisy Image Volume </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhou%2C+L">Langrui Zhou</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Z">Ziteng Zhou</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+X">Xinyu Huang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiangyu Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Huiru Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Guang Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10831v2-abstract-short" style="display: inline;"> In the last few years, with the rapid development of deep learning technologies, supervised methods based on convolutional neural networks have greatly enhanced the performance of medical image denoising. However, these methods require large quantities of noisy-clean image pairs for training, which greatly limits their practicality. Although some researchers have attempted to train denoising netwo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10831v2-abstract-full').style.display = 'inline'; document.getElementById('2411.10831v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10831v2-abstract-full" style="display: none;"> In the last few years, with the rapid development of deep learning technologies, supervised methods based on convolutional neural networks have greatly enhanced the performance of medical image denoising. However, these methods require large quantities of noisy-clean image pairs for training, which greatly limits their practicality. Although some researchers have attempted to train denoising networks using only single noisy images, existing self-supervised methods, including blind-spot-based and data-splitting-based methods, heavily rely on the assumption that noise is pixel-wise independent. However, this assumption often does not hold in real-world medical images. Therefore, in the field of medical imaging, there remains a lack of simple and practical denoising methods that can achieve high-quality denoising performance using only single noisy images. In this paper, we propose a novel self-supervised medical image denoising method, Neighboring Slice Noise2Noise (NS-N2N). The proposed method utilizes neighboring slices within a single noisy image volume to construct weighted training data, and then trains the denoising network using a self-supervised scheme with regional consistency loss and inter-slice continuity loss. NS-N2N only requires a single noisy image volume obtained from one medical imaging procedure to achieve high-quality denoising of the image volume itself. Extensive experiments demonstrate that the proposed method outperforms state-of-the-art self-supervised denoising methods in both denoising performance and processing efficiency. Furthermore, since NS-N2N operates solely in the image domain, it is free from device-specific issues such as reconstruction geometry, making it easier to apply in various clinical practices. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10831v2-abstract-full').style.display = 'none'; document.getElementById('2411.10831v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10741">arXiv:2411.10741</a> <span> [<a href="https://arxiv.org/pdf/2411.10741">pdf</a>, <a href="https://arxiv.org/format/2411.10741">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> MetaLA: Unified Optimal Linear Approximation to Softmax Attention Map </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chou%2C+Y">Yuhong Chou</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+M">Man Yao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+K">Kexin Wang</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+Y">Yuqi Pan</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+R">Ruijie Zhu</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+Y">Yiran Zhong</a>, <a href="/search/cs?searchtype=author&query=Qiao%2C+Y">Yu Qiao</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+J">Jibin Wu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+B">Bo Xu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Guoqi Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10741v1-abstract-short" style="display: inline;"> Various linear complexity models, such as Linear Transformer (LinFormer), State Space Model (SSM), and Linear RNN (LinRNN), have been proposed to replace the conventional softmax attention in Transformer structures. However, the optimal design of these linear models is still an open question. In this work, we attempt to answer this question by finding the best linear approximation to softmax atten… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10741v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10741v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10741v1-abstract-full" style="display: none;"> Various linear complexity models, such as Linear Transformer (LinFormer), State Space Model (SSM), and Linear RNN (LinRNN), have been proposed to replace the conventional softmax attention in Transformer structures. However, the optimal design of these linear models is still an open question. In this work, we attempt to answer this question by finding the best linear approximation to softmax attention from a theoretical perspective. We start by unifying existing linear complexity models as the linear attention form and then identify three conditions for the optimal linear attention design: 1) Dynamic memory ability; 2) Static approximation ability; 3) Least parameter approximation. We find that none of the current linear models meet all three conditions, resulting in suboptimal performance. Instead, we propose Meta Linear Attention (MetaLA) as a solution that satisfies these conditions. Our experiments on Multi-Query Associative Recall (MQAR) task, language modeling, image classification, and Long-Range Arena (LRA) benchmark demonstrate that MetaLA is more effective than the existing linear models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10741v1-abstract-full').style.display = 'none'; document.getElementById('2411.10741v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10336">arXiv:2411.10336</a> <span> [<a href="https://arxiv.org/pdf/2411.10336">pdf</a>, <a href="https://arxiv.org/format/2411.10336">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> BMP: Bridging the Gap between B-Spline and Movement Primitives </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liao%2C+W">Weiran Liao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Ge Li</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+H">Hongyi Zhou</a>, <a href="/search/cs?searchtype=author&query=Lioutikov%2C+R">Rudolf Lioutikov</a>, <a href="/search/cs?searchtype=author&query=Neumann%2C+G">Gerhard Neumann</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10336v1-abstract-short" style="display: inline;"> This work introduces B-spline Movement Primitives (BMPs), a new Movement Primitive (MP) variant that leverages B-splines for motion representation. B-splines are a well-known concept in motion planning due to their ability to generate complex, smooth trajectories with only a few control points while satisfying boundary conditions, i.e., passing through a specified desired position with desired vel… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10336v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10336v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10336v1-abstract-full" style="display: none;"> This work introduces B-spline Movement Primitives (BMPs), a new Movement Primitive (MP) variant that leverages B-splines for motion representation. B-splines are a well-known concept in motion planning due to their ability to generate complex, smooth trajectories with only a few control points while satisfying boundary conditions, i.e., passing through a specified desired position with desired velocity. However, current usages of B-splines tend to ignore the higher-order statistics in trajectory distributions, which limits their usage in imitation learning (IL) and reinforcement learning (RL), where modeling trajectory distribution is essential. In contrast, MPs are commonly used in IL and RL for their capacity to capture trajectory likelihoods and correlations. However, MPs are constrained by their abilities to satisfy boundary conditions and usually need extra terms in learning objectives to satisfy velocity constraints. By reformulating B-splines as MPs, represented through basis functions and weight parameters, BMPs combine the strengths of both approaches, allowing B-splines to capture higher-order statistics while retaining their ability to satisfy boundary conditions. Empirical results in IL and RL demonstrate that BMPs broaden the applicability of B-splines in robot learning and offer greater expressiveness compared to existing MP variants. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10336v1-abstract-full').style.display = 'none'; document.getElementById('2411.10336v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07612">arXiv:2411.07612</a> <span> [<a href="https://arxiv.org/pdf/2411.07612">pdf</a>, <a href="https://arxiv.org/format/2411.07612">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> A Joint Prediction Method of Multi-Agent to Reduce Collision Rate </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+M">Mingyi Wang</a>, <a href="/search/cs?searchtype=author&query=Zou%2C+H">Hongqun Zou</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yifan Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">You Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Guang Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.07612v3-abstract-short" style="display: inline;"> Predicting future motions of road participants is an important task for driving autonomously. Most existing models excel at predicting the marginal trajectory of a single agent, but predicting joint trajectories for multiple agents that are consistent within a scene remains a challenge. Previous research has often focused on marginal predictions, but the importance of joint predictions has become… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07612v3-abstract-full').style.display = 'inline'; document.getElementById('2411.07612v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.07612v3-abstract-full" style="display: none;"> Predicting future motions of road participants is an important task for driving autonomously. Most existing models excel at predicting the marginal trajectory of a single agent, but predicting joint trajectories for multiple agents that are consistent within a scene remains a challenge. Previous research has often focused on marginal predictions, but the importance of joint predictions has become increasingly apparent. Joint prediction aims to generate trajectories that are consistent across the entire scene. Our research builds upon the SIMPL baseline to explore methods for generating scene-consistent trajectories. We tested our algorithm on the Argoverse 2 dataset, and experimental results demonstrate that our approach can generate scene-consistent trajectories. Compared to the SIMPL baseline, our method significantly reduces the collision rate of joint trajectories within the scene. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07612v3-abstract-full').style.display = 'none'; document.getElementById('2411.07612v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07112">arXiv:2411.07112</a> <span> [<a href="https://arxiv.org/pdf/2411.07112">pdf</a>, <a href="https://arxiv.org/format/2411.07112">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> ROCODE: Integrating Backtracking Mechanism and Program Analysis in Large Language Models for Code Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jiang%2C+X">Xue Jiang</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+Y">Yihong Dong</a>, <a href="/search/cs?searchtype=author&query=Tao%2C+Y">Yongding Tao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Huanyu Liu</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+Z">Zhi Jin</a>, <a href="/search/cs?searchtype=author&query=Jiao%2C+W">Wenpin Jiao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Ge Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.07112v1-abstract-short" style="display: inline;"> Large language models (LLMs) have achieved impressive performance in code generation recently, offering programmers revolutionary assistance in software development. However, due to the auto-regressive nature of LLMs, they are susceptible to error accumulation during code generation. Once an error is produced, LLMs can merely continue to generate the subsequent code conditioned on it, given their… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07112v1-abstract-full').style.display = 'inline'; document.getElementById('2411.07112v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.07112v1-abstract-full" style="display: none;"> Large language models (LLMs) have achieved impressive performance in code generation recently, offering programmers revolutionary assistance in software development. However, due to the auto-regressive nature of LLMs, they are susceptible to error accumulation during code generation. Once an error is produced, LLMs can merely continue to generate the subsequent code conditioned on it, given their inability to adjust previous outputs. Existing LLM-based approaches typically consider post-revising after code generation, leading to the challenging resolution of accumulated errors and the significant wastage of resources. Ideally, LLMs should rollback and resolve the occurred error in time during code generation, rather than proceed on the basis of the error and wait for post-revising after generation. In this paper, we propose ROCODE, which integrates the backtracking mechanism and program analysis into LLMs for code generation. Specifically, we employ program analysis to perform incremental error detection during the generation process. When an error is detected, the backtracking mechanism is triggered to priming rollback strategies and constraint regeneration, thereby eliminating the error early and ensuring continued generation on the correct basis. Experiments on multiple code generation benchmarks show that ROCODE can significantly reduce the errors generated by LLMs, with a compilation pass rate of 99.1%. The test pass rate is improved by up to 23.8% compared to the best baseline approach. Compared to the post-revising baseline, the token cost is reduced by 19.3%. Moreover, our approach is model-agnostic and achieves consistent improvements across nine representative LLMs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07112v1-abstract-full').style.display = 'none'; document.getElementById('2411.07112v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICSE 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06692">arXiv:2411.06692</a> <span> [<a href="https://arxiv.org/pdf/2411.06692">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Layout Control and Semantic Guidance with Attention Loss Backward for T2I Diffusion Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+G">Guandong Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06692v1-abstract-short" style="display: inline;"> Controllable image generation has always been one of the core demands in image generation, aiming to create images that are both creative and logical while satisfying additional specified conditions. In the post-AIGC era, controllable generation relies on diffusion models and is accomplished by maintaining certain components or introducing inference interferences. This paper addresses key challeng… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06692v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06692v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06692v1-abstract-full" style="display: none;"> Controllable image generation has always been one of the core demands in image generation, aiming to create images that are both creative and logical while satisfying additional specified conditions. In the post-AIGC era, controllable generation relies on diffusion models and is accomplished by maintaining certain components or introducing inference interferences. This paper addresses key challenges in controllable generation: 1. mismatched object attributes during generation and poor prompt-following effects; 2. inadequate completion of controllable layouts. We propose a train-free method based on attention loss backward, cleverly controlling the cross attention map. By utilizing external conditions such as prompts that can reasonably map onto the attention map, we can control image generation without any training or fine-tuning. This method addresses issues like attribute mismatch and poor prompt-following while introducing explicit layout constraints for controllable image generation. Our approach has achieved excellent practical applications in production, and we hope it can serve as an inspiring technical report in this field. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06692v1-abstract-full').style.display = 'none'; document.getElementById('2411.06692v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06135">arXiv:2411.06135</a> <span> [<a href="https://arxiv.org/pdf/2411.06135">pdf</a>, <a href="https://arxiv.org/format/2411.06135">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> Online Parallel Multi-Task Relationship Learning via Alternating Direction Method of Multipliers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+R">Ruiyu Li</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+P">Peilin Zhao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Guangxia Li</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Zhiqiang Xu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xuewei Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06135v1-abstract-short" style="display: inline;"> Online multi-task learning (OMTL) enhances streaming data processing by leveraging the inherent relations among multiple tasks. It can be described as an optimization problem in which a single loss function is defined for multiple tasks. Existing gradient-descent-based methods for this problem might suffer from gradient vanishing and poor conditioning issues. Furthermore, the centralized setting h… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06135v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06135v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06135v1-abstract-full" style="display: none;"> Online multi-task learning (OMTL) enhances streaming data processing by leveraging the inherent relations among multiple tasks. It can be described as an optimization problem in which a single loss function is defined for multiple tasks. Existing gradient-descent-based methods for this problem might suffer from gradient vanishing and poor conditioning issues. Furthermore, the centralized setting hinders their application to online parallel optimization, which is vital to big data analytics. Therefore, this study proposes a novel OMTL framework based on the alternating direction multiplier method (ADMM), a recent breakthrough in optimization suitable for the distributed computing environment because of its decomposable and easy-to-implement nature. The relations among multiple tasks are modeled dynamically to fit the constant changes in an online scenario. In a classical distributed computing architecture with a central server, the proposed OMTL algorithm with the ADMM optimizer outperforms SGD-based approaches in terms of accuracy and efficiency. Because the central server might become a bottleneck when the data scale grows, we further tailor the algorithm to a decentralized setting, so that each node can work by only exchanging information with local neighbors. Experimental results on a synthetic and several real-world datasets demonstrate the efficiency of our methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06135v1-abstract-full').style.display = 'none'; document.getElementById('2411.06135v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accpeted by Neurocomputing</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06111">arXiv:2411.06111</a> <span> [<a href="https://arxiv.org/pdf/2411.06111">pdf</a>, <a href="https://arxiv.org/format/2411.06111">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Energy-efficient Hybrid Model Predictive Trajectory Planning for Autonomous Electric Vehicles </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ding%2C+F">Fan Ding</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+X">Xuewen Luo</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Gaoxuan Li</a>, <a href="/search/cs?searchtype=author&query=Tew%2C+H+H">Hwa Hui Tew</a>, <a href="/search/cs?searchtype=author&query=Loo%2C+J+Y">Junn Yong Loo</a>, <a href="/search/cs?searchtype=author&query=Tong%2C+C+W">Chor Wai Tong</a>, <a href="/search/cs?searchtype=author&query=Bakibillah%2C+A+S+M">A. S. M Bakibillah</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Z">Ziyuan Zhao</a>, <a href="/search/cs?searchtype=author&query=Tao%2C+Z">Zhiyu Tao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06111v1-abstract-short" style="display: inline;"> To tackle the twin challenges of limited battery life and lengthy charging durations in electric vehicles (EVs), this paper introduces an Energy-efficient Hybrid Model Predictive Planner (EHMPP), which employs an energy-saving optimization strategy. EHMPP focuses on refining the design of the motion planner to be seamlessly integrated with the existing automatic driving algorithms, without additio… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06111v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06111v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06111v1-abstract-full" style="display: none;"> To tackle the twin challenges of limited battery life and lengthy charging durations in electric vehicles (EVs), this paper introduces an Energy-efficient Hybrid Model Predictive Planner (EHMPP), which employs an energy-saving optimization strategy. EHMPP focuses on refining the design of the motion planner to be seamlessly integrated with the existing automatic driving algorithms, without additional hardware. It has been validated through simulation experiments on the Prescan, CarSim, and Matlab platforms, demonstrating that it can increase passive recovery energy by 11.74\% and effectively track motor speed and acceleration at optimal power. To sum up, EHMPP not only aids in trajectory planning but also significantly boosts energy efficiency in autonomous EVs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06111v1-abstract-full').style.display = 'none'; document.getElementById('2411.06111v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at the IEEE International Conference on Systems, Man, and Cybernetics (SMC) 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.05693">arXiv:2411.05693</a> <span> [<a href="https://arxiv.org/pdf/2411.05693">pdf</a>, <a href="https://arxiv.org/format/2411.05693">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> YOSO: You-Only-Sample-Once via Compressed Sensing for Graph Neural Network Training </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yi Li</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Z">Zhichun Guo</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Guanpeng Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+B">Bingzhe Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.05693v1-abstract-short" style="display: inline;"> Graph neural networks (GNNs) have become essential tools for analyzing non-Euclidean data across various domains. During training stage, sampling plays an important role in reducing latency by limiting the number of nodes processed, particularly in large-scale applications. However, as the demand for better prediction performance grows, existing sampling algorithms become increasingly complex, lea… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05693v1-abstract-full').style.display = 'inline'; document.getElementById('2411.05693v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.05693v1-abstract-full" style="display: none;"> Graph neural networks (GNNs) have become essential tools for analyzing non-Euclidean data across various domains. During training stage, sampling plays an important role in reducing latency by limiting the number of nodes processed, particularly in large-scale applications. However, as the demand for better prediction performance grows, existing sampling algorithms become increasingly complex, leading to significant overhead. To mitigate this, we propose YOSO (You-Only-Sample-Once), an algorithm designed to achieve efficient training while preserving prediction accuracy. YOSO introduces a compressed sensing (CS)-based sampling and reconstruction framework, where nodes are sampled once at input layer, followed by a lossless reconstruction at the output layer per epoch. By integrating the reconstruction process with the loss function of specific learning tasks, YOSO not only avoids costly computations in traditional compressed sensing (CS) methods, such as orthonormal basis calculations, but also ensures high-probability accuracy retention which equivalent to full node participation. Experimental results on node classification and link prediction demonstrate the effectiveness and efficiency of YOSO, reducing GNN training by an average of 75\% compared to state-of-the-art methods, while maintaining accuracy on par with top-performing baselines. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05693v1-abstract-full').style.display = 'none'; document.getElementById('2411.05693v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Li%2C+G&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Li%2C+G&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Li%2C+G&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Li%2C+G&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Li%2C+G&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Li%2C+G&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <div class="is-hidden-tablet">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary">  <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div>   <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div>  </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

CINXE.COM

Search | arXiv e-print repository