Search | arXiv e-print repository

<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 476 results for author: <span class="mathjax">Zhao, R</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Zhao%2C+R">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Zhao, R"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Zhao%2C+R&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Zhao, R"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Zhao%2C+R&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Zhao%2C+R&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Zhao%2C+R&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Zhao%2C+R&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Zhao%2C+R&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Zhao%2C+R&amp;start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">&hellip;</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.14063">arXiv:2502.14063</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.14063">pdf</a>, <a href="https://arxiv.org/format/2502.14063">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> PedDet: Adaptive Spectral Optimization for Multimodal Pedestrian Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+R">Rui Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zeyu Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+Y">Yi Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Yao%2C+Y">Yi Yao</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+Y">Yan Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+W">Wenxin Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+Z">Zirui Song</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xiuying Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+Y">Yang Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.14063v1-abstract-short" style="display: inline;"> Pedestrian detection in intelligent transportation systems has made significant progress but faces two critical challenges: (1) insufficient fusion of complementary information between visible and infrared spectra, particularly 
in complex scenarios, and (2) sensitivity to illumination changes, such as low-light or overexposed conditions, leading to degraded performance. To address these issues, we&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.14063v1-abstract-full').style.display = 'inline'; document.getElementById('2502.14063v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.14063v1-abstract-full" style="display: none;"> Pedestrian detection in intelligent transportation systems has made significant progress but faces two critical challenges: (1) insufficient fusion of complementary information between visible and infrared spectra, particularly in complex scenarios, and (2) sensitivity to illumination changes, such as low-light or overexposed conditions, leading to degraded performance. To address these issues, we propose PedDet, an adaptive spectral optimization complementarity framework specifically enhanced and optimized for multispectral pedestrian detection. PedDet introduces the Multi-scale Spectral Feature Perception Module (MSFPM) to adaptively fuse visible and infrared features, enhancing robustness and flexibility in feature extraction. Additionally, the Illumination Robustness Feature Decoupling Module (IRFDM) improves detection stability under varying lighting by decoupling pedestrian and background features. We further design a contrastive alignment to enhance intermodal feature discrimination. Experiments on LLVIP and MSDS datasets demonstrate that PedDet achieves state-of-the-art performance, improving the mAP by 6.6% with superior detection accuracy even in low-light conditions, marking a significant step forward for road safety. Code will be available at https://github.com/AIGeeksGroup/PedDet. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.14063v1-abstract-full').style.display = 'none'; document.getElementById('2502.14063v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.14051">arXiv:2502.14051</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.14051">pdf</a>, <a href="https://arxiv.org/format/2502.14051">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> RocketKV: Accelerating Long-Context LLM Inference via Two-Stage KV Cache Compression </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Behnam%2C+P">Payman Behnam</a>, <a href="/search/cs?searchtype=author&amp;query=Fu%2C+Y">Yaosheng Fu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+R">Ritchie Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Tsai%2C+P">Po-An Tsai</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+Z">Zhiding Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Tumanov%2C+A">Alexey Tumanov</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.14051v1-abstract-short" style="display: inline;"> Transformer-based Large Language Models rely critically on KV cache to efficiently handle extended contexts during the decode phase. Yet, the size of the KV cache grows proportionally with the input length, burdening both memory bandwidth and capacity as decoding progresses. To address this challenge, we present RocketKV, a training-free KV cache compression strategy designed specifically to reduc&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.14051v1-abstract-full').style.display = 'inline'; document.getElementById('2502.14051v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.14051v1-abstract-full" style="display: none;"> Transformer-based Large Language Models rely critically on KV cache to efficiently handle extended contexts during the decode phase. Yet, the size of the KV cache grows proportionally with the input length, burdening both memory bandwidth and capacity as decoding progresses. To address this challenge, we present RocketKV, a training-free KV cache compression strategy designed specifically to reduce both memory bandwidth and capacity demand of KV cache during the decode phase. RocketKV contains two consecutive stages. In the first stage, it performs coarse-grain KV cache eviction on the input sequence tokens with SnapKV++, a method improved upon SnapKV by introducing adaptive pooling size and full compatibility with grouped-query attention. In the second stage, it adopts a hybrid attention method to conduct fine-grain top-k sparse attention, approximating the attention scores by leveraging both head and sequence dimensional reductions. Combining these two stages, RocketKV achieves significant KV cache fetching bandwidth and storage savings while maintaining comparable accuracy to full KV cache attention. 
We show that RocketKV provides end-to-end speedup by up to 3$\times$ as well as peak memory reduction by up to 31% in the decode phase on an NVIDIA H100 GPU compared to the full KV cache baseline, while achieving negligible accuracy loss on a variety of long-context tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.14051v1-abstract-full').style.display = 'none'; document.getElementById('2502.14051v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.11610">arXiv:2502.11610</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.11610">pdf</a>, <a href="https://arxiv.org/format/2502.11610">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> Accuracy Assessment of OpenAlex and Clarivate Scholar ID with an LLM-Assisted Benchmark </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+R">Renyu Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yunxin Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.11610v1-abstract-short" style="display: inline;"> In quantitative SciSci (science of science) studies, accurately identifying individual scholars is paramount for scientific data analysis. However, the variability in how names are represented-due to commonality, abbreviations, and different spelling conventions-complicates this task. While identifier systems like ORCID are being developed, many scholars remain unregistered, and numerous publicati&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11610v1-abstract-full').style.display = 'inline'; document.getElementById('2502.11610v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.11610v1-abstract-full" style="display: none;"> In quantitative SciSci (science of science) studies, accurately identifying individual scholars is paramount for scientific data analysis. However, the variability in how names are represented-due to commonality, abbreviations, and different spelling conventions-complicates this task. While identifier systems like ORCID are being developed, many scholars remain unregistered, and numerous publications are not included. Scholarly databases such as Clarivate and OpenAlex have introduced their own ID systems as preliminary name disambiguation solutions. This study evaluates the effectiveness of these systems across different groups to determine their suitability for various application scenarios. We sampled authors from the top quartile (Q1) of Web of Science (WOS) journals based on country, discipline, and number of corresponding author papers. For each group, we selected 100 scholars and meticulously annotated all their papers using a Search-enhanced Large Language Model method. 
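A note on the memory pressure this entry targets: a full KV cache grows linearly with context length, which is what makes decode-phase compression attractive. The sketch below estimates that growth with the standard per-token key/value storage formula; the model dimensions are illustrative placeholders, not figures from the paper, and the code is not RocketKV's method.

```python
# Back-of-the-envelope estimate of full (uncompressed) KV cache size during decoding.
# Model dimensions below are hypothetical placeholders, not taken from the RocketKV paper.

def kv_cache_bytes(seq_len: int,
                   n_layers: int = 32,
                   n_kv_heads: int = 32,
                   head_dim: int = 128,
                   bytes_per_elem: int = 2,   # fp16/bf16
                   batch_size: int = 1) -> int:
    """Keys and values each store n_kv_heads * head_dim elements per token per layer."""
    per_token = 2 * n_layers * n_kv_heads * head_dim * bytes_per_elem  # K and V
    return batch_size * seq_len * per_token

if __name__ == "__main__":
    for ctx in (4_096, 32_768, 131_072):
        gib = kv_cache_bytes(ctx) / 2**30
        print(f"{ctx:>7} tokens -> {gib:6.1f} GiB of KV cache")
```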
3. arXiv:2502.11610 [pdf, other] (cs.IR)
   Accuracy Assessment of OpenAlex and Clarivate Scholar ID with an LLM-Assisted Benchmark
   Authors: Renyu Zhao, Yunxin Chen
   Abstract: In quantitative SciSci (science of science) studies, accurately identifying individual scholars is paramount for scientific data analysis. However, the variability in how names are represented (due to commonality, abbreviations, and different spelling conventions) complicates this task. While identifier systems like ORCID are being developed, many scholars remain unregistered, and numerous publications are not included. Scholarly databases such as Clarivate and OpenAlex have introduced their own ID systems as preliminary name disambiguation solutions. This study evaluates the effectiveness of these systems across different groups to determine their suitability for various application scenarios. We sampled authors from the top quartile (Q1) of Web of Science (WOS) journals based on country, discipline, and number of corresponding author papers. For each group, we selected 100 scholars and meticulously annotated all their papers using a Search-enhanced Large Language Model method. Using these annotations, we identified the corresponding IDs in OpenAlex and Clarivate, extracted all associated papers, filtered for Q1 WOS journals, and calculated precision and recall by comparing against the annotated dataset.
   Submitted 17 February, 2025; originally announced February 2025.
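The precision/recall evaluation described above reduces to comparing the set of papers attached to a database author ID against the annotated ground-truth set for the same scholar. A minimal sketch of that comparison follows; the paper identifiers are invented for illustration.

```python
# Minimal precision/recall computation over sets of paper identifiers, as in comparing
# a database ID's publication list against a human-annotated one. IDs are made up.

def precision_recall(predicted: set[str], annotated: set[str]) -> tuple[float, float]:
    true_positives = len(predicted & annotated)
    precision = true_positives / len(predicted) if predicted else 0.0
    recall = true_positives / len(annotated) if annotated else 0.0
    return precision, recall

if __name__ == "__main__":
    papers_from_id = {"W001", "W002", "W003", "W004"}  # papers linked to one author ID
    papers_annotated = {"W001", "W002", "W005"}        # annotated ground truth
    p, r = precision_recall(papers_from_id, papers_annotated)
    print(f"precision={p:.2f} recall={r:.2f}")
```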
4. arXiv:2502.01857 [pdf, other] (cs.RO, cs.AI)
   Learning Human Perception Dynamics for Informative Robot Communication
   Authors: Shenghui Chen, Ruihan Zhao, Sandeep Chinchali, Ufuk Topcu
   Abstract: Human-robot cooperative navigation is challenging in environments with incomplete information. We introduce CoNav-Maze, a simulated robotics environment where a robot navigates using local perception while a human operator provides guidance based on an inaccurate map. The robot can share its camera views to improve the operator's understanding of the environment. To enable efficient human-robot cooperation, we propose Information Gain Monte Carlo Tree Search (IG-MCTS), an online planning algorithm that balances autonomous movement and informative communication. Central to IG-MCTS is a neural human perception dynamics model that estimates how humans distill information from robot communications. We collect a dataset through a crowdsourced mapping task in CoNav-Maze and train this model using a fully convolutional architecture with data augmentation. User studies show that IG-MCTS outperforms teleoperation and instruction-following baselines, achieving comparable task performance with significantly less communication and lower human cognitive load, as evidenced by eye-tracking metrics.
   Submitted 3 February, 2025; originally announced February 2025.
5. arXiv:2502.01317 [pdf, other] (cs.HC, cs.CY)
   DietGlance: Dietary Monitoring and Personalized Analysis at a Glance with Knowledge-Empowered AI Assistant
   Authors: Zhihan Jiang, Running Zhao, Lin Lin, Yue Yu, Handi Chen, Xinchen Zhang, Xuhai Xu, Yifang Wang, Xiaojuan Ma, Edith C. H. Ngai
   Abstract: Growing awareness of wellness has prompted people to consider whether their dietary patterns align with their health and fitness goals. In response, researchers have introduced various wearable dietary monitoring systems and dietary assessment approaches. However, these solutions are either limited to identifying foods with simple ingredients or insufficient in providing analysis of individual dietary behaviors with domain-specific knowledge. In this paper, we present DietGlance, a system that automatically monitors dietary intake in daily routines and delivers personalized analysis from knowledge sources. DietGlance first detects ingestive episodes from multimodal inputs using eyeglasses, capturing privacy-preserving meal images of various dishes being consumed. Based on the inferred food items and consumed quantities from these images, DietGlance further provides nutritional analysis and personalized dietary suggestions, empowered by the retrieval augmentation generation module on a reliable nutrition library. A short-term user study (N=33) and a four-week longitudinal study (N=16) demonstrate the usability and effectiveness of DietGlance.
   Submitted 3 February, 2025; originally announced February 2025.
6. arXiv:2502.01002 [pdf] (cs.CV)
   Multi-Resolution SAR and Optical Remote Sensing Image Registration Methods: A Review, Datasets, and Future Perspectives
   Authors: Wenfei Zhang, Ruipeng Zhao, Yongxiang Yao, Yi Wan, Peihao Wu, Jiayuan Li, Yansheng Li, Yongjun Zhang
   Abstract: Synthetic Aperture Radar (SAR) and optical image registration is essential for remote sensing data fusion, with applications in military reconnaissance, environmental monitoring, and disaster management. However, challenges arise from differences in imaging mechanisms, geometric distortions, and radiometric properties between SAR and optical images. As image resolution increases, fine SAR textures become more significant, leading to alignment issues and 3D spatial discrepancies. Two major gaps exist: the lack of a publicly available multi-resolution, multi-scene registration dataset and the absence of systematic analysis of current methods. To address this, the MultiResSAR dataset was created, containing over 10k pairs of multi-source, multi-resolution, and multi-scene SAR and optical images. Sixteen state-of-the-art algorithms were tested. Results show no algorithm achieves 100% success, and performance decreases as resolution increases, with most failing on sub-meter data. XoFTR performs best among deep learning methods (40.58%), while RIFT performs best among traditional methods (66.51%). Future research should focus on noise suppression, 3D geometric fusion, cross-view transformation modeling, and deep learning optimization for robust registration of high-resolution SAR and optical images. The dataset is available at https://github.com/betterlll/Multi-Resolution-SAR-dataset-.
   Submitted 2 February, 2025; originally announced February 2025.
   Comments: 48 pages, 10 figures
7. arXiv:2501.17343 [pdf, other] (cs.CV, cs.AI)
   Post-Training Quantization for 3D Medical Image Segmentation: A Practical Study on Real Inference Engines
   Authors: Chongyu Qu, Ritchie Zhao, Ye Yu, Bin Liu, Tianyuan Yao, Junchao Zhu, Bennett A. Landman, Yucheng Tang, Yuankai Huo
   Abstract: Quantizing deep neural networks, reducing the precision (bit-width) of their computations, can remarkably decrease memory usage and accelerate processing, making these models more suitable for large-scale medical imaging applications with limited computational resources. However, many existing methods studied "fake quantization", which simulates lower precision operations during inference, but does not actually reduce model size or improve real-world inference speed. Moreover, the potential of deploying real 3D low-bit quantization on modern GPUs is still unexplored. In this study, we introduce a real post-training quantization (PTQ) framework that successfully implements true 8-bit quantization on state-of-the-art (SOTA) 3D medical segmentation models, i.e., U-Net, SegResNet, SwinUNETR, nnU-Net, UNesT, TransUNet, ST-UNet, and VISTA3D. Our approach involves two main steps. First, we use TensorRT to perform fake quantization for both weights and activations with an unlabeled calibration dataset. Second, we convert this fake quantization into real quantization via TensorRT engine on real GPUs, resulting in real-world reductions in model size and inference latency. Extensive experiments demonstrate that our framework effectively performs 8-bit quantization on GPUs without sacrificing model performance. This advancement enables the deployment of efficient deep learning models in medical imaging applications where computational resources are constrained. The code and models have been released, including U-Net, TransUNet pretrained on the BTCV dataset for abdominal (13-label) segmentation, UNesT pretrained on the Whole Brain Dataset for whole brain (133-label) segmentation, and nnU-Net, SegResNet, SwinUNETR and VISTA3D pretrained on TotalSegmentator V2 for full body (104-label) segmentation. https://github.com/hrlblab/PTQ.
   Submitted 28 January, 2025; originally announced January 2025.
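To make the "fake quantization" distinction above concrete: fake quantization rounds values to an 8-bit grid but keeps them in floating point, so accuracy effects appear while memory and speed do not change, whereas real quantization stores and computes with the int8 tensor. The NumPy sketch below illustrates this generically; it is not the TensorRT-based pipeline the paper uses.

```python
import numpy as np

def quantize_int8(x: np.ndarray) -> tuple[np.ndarray, float]:
    """Symmetric per-tensor int8 quantization; returns int8 values plus the scale."""
    scale = float(np.abs(x).max()) / 127.0
    scale = scale if scale > 0 else 1.0
    q = np.clip(np.round(x / scale), -128, 127).astype(np.int8)
    return q, scale

def fake_quantize_int8(x: np.ndarray) -> np.ndarray:
    """'Fake' quantization: snap to the int8 grid but return float32 again,
    so accuracy effects are visible while storage and kernels stay in float."""
    q, scale = quantize_int8(x)
    return q.astype(np.float32) * scale

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    w = rng.normal(size=(4, 4)).astype(np.float32)
    q, scale = quantize_int8(w)       # real: 1 byte per weight plus one scale
    w_sim = fake_quantize_int8(w)     # fake: still 4 bytes per weight
    print("int8 bytes:", q.nbytes, "float32 bytes:", w.nbytes)
    print("max rounding error:", float(np.abs(w - w_sim).max()))
```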
8. arXiv:2501.14794 [pdf, other] (cs.DC, cs.AI, cs.LG)
   HeteroLLM: Accelerating Large Language Model Inference on Mobile SoCs platform with Heterogeneous AI Accelerators
   Authors: Le Chen, Dahu Feng, Erhu Feng, Rong Zhao, Yingrui Wang, Yubin Xia, Haibo Chen, Pinjie Xu
   Abstract: With the rapid advancement of artificial intelligence technologies such as ChatGPT, AI agents and video generation, contemporary mobile systems have begun integrating these AI capabilities on local devices to enhance privacy and reduce response latency. To meet the computational demands of AI tasks, current mobile SoCs are equipped with diverse AI accelerators, including GPUs and Neural Processing Units (NPUs). However, there has not been a comprehensive characterization of these heterogeneous processors, and existing designs typically only leverage a single AI accelerator for LLM inference, leading to suboptimal use of computational resources and memory bandwidth. In this paper, we first summarize key performance characteristics of mobile SoC, including heterogeneous processors, unified memory, synchronization, etc. Drawing on these observations, we propose different tensor partition strategies to fulfill the distinct requirements of the prefill and decoding phases. We further design a fast synchronization mechanism that leverages the unified memory address provided by mobile SoCs. By employing these techniques, we present HeteroLLM, the fastest LLM inference engine in mobile devices which supports both layer-level and tensor-level heterogeneous execution. Evaluation results show that HeteroLLM achieves 9.99 and 4.36 performance improvement over other mobile-side LLM inference engines: MLC and MNN.
   Submitted 10 January, 2025; originally announced January 2025.
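The "tensor partition strategies" mentioned above can be pictured as splitting one linear layer's weight matrix across two accelerators and concatenating the partial outputs. The NumPy sketch below demonstrates the idea on CPU with a made-up split ratio; HeteroLLM's actual GPU/NPU partitioning and synchronization are considerably more involved.

```python
import numpy as np

# Conceptual tensor-level partition of one linear layer y = x @ W across two
# accelerators: W is split along its output dimension, each device computes a
# slice, and the slices are concatenated. The 0.7/0.3 ratio is a placeholder.

def partitioned_matmul(x: np.ndarray, w: np.ndarray, ratio: float = 0.7) -> np.ndarray:
    split = int(w.shape[1] * ratio)
    y_fast = x @ w[:, :split]   # would run on the faster accelerator
    y_slow = x @ w[:, split:]   # would run on the slower accelerator, in parallel
    return np.concatenate([y_fast, y_slow], axis=-1)

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    x = rng.normal(size=(1, 16, 64)).astype(np.float32)  # [batch, tokens, hidden]
    w = rng.normal(size=(64, 256)).astype(np.float32)
    assert np.allclose(partitioned_matmul(x, w), x @ w, atol=1e-5)
    print("partitioned result matches the unpartitioned matmul")
```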
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.12935">arXiv:2501.12935</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.12935">pdf</a>, <a href="https://arxiv.org/format/2501.12935">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> 3D Object Manipulation in a Single Image using Generative Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+R">Ruisi Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zechuan Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Z">Zongxin Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Y">Yi Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.12935v1-abstract-short" style="display: inline;"> Object manipulation in images aims to not only edit the object&#39;s presentation but also gift objects with motion. Previous methods encountered challenges in concurrently handling static editing and dynamic generation, while also struggling to achieve fidelity in object appearance and scene lighting. In this work, we introduce \textbf{OMG3D}, a novel framework that integrates the precise geometric c&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.12935v1-abstract-full').style.display = 'inline'; document.getElementById('2501.12935v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.12935v1-abstract-full" style="display: none;"> Object manipulation in images aims to not only edit the object&#39;s presentation but also gift objects with motion. Previous methods encountered challenges in concurrently handling static editing and dynamic generation, while also struggling to achieve fidelity in object appearance and scene lighting. In this work, we introduce \textbf{OMG3D}, a novel framework that integrates the precise geometric control with the generative power of diffusion models, thus achieving significant enhancements in visual performance. Our framework first converts 2D objects into 3D, enabling user-directed modifications and lifelike motions at the geometric level. To address texture realism, we propose CustomRefiner, a texture refinement module that pre-train a customized diffusion model, aligning the details and style of coarse renderings of 3D rough model with the original image, further refine the texture. Additionally, we introduce IllumiCombiner, a lighting processing module that estimates and corrects background lighting to match human visual perception, resulting in more realistic shadow effects. Extensive experiments demonstrate the outstanding visual performance of our approach in both static and dynamic scenarios. Remarkably, all these steps can be done using one NVIDIA 3090. 
10. arXiv:2501.10408 [pdf, other] (eess.AS, cs.CL, cs.SD)
    Leveraging Cross-Attention Transformer and Multi-Feature Fusion for Cross-Linguistic Speech Emotion Recognition
    Authors: Ruoyu Zhao, Xiantao Jiang, F. Richard Yu, Victor C. M. Leung, Tao Wang, Shaohu Zhang
    Abstract: Speech Emotion Recognition (SER) plays a crucial role in enhancing human-computer interaction. Cross-Linguistic SER (CLSER) has been a challenging research problem due to significant variability in linguistic and acoustic features of different languages. In this study, we propose a novel approach HuMP-CAT, which combines HuBERT, MFCC, and prosodic characteristics. These features are fused using a cross-attention transformer (CAT) mechanism during feature extraction. Transfer learning is applied to gain from a source emotional speech dataset to the target corpus for emotion recognition. We use IEMOCAP as the source dataset to train the source model and evaluate the proposed method on seven datasets in five languages (e.g., English, German, Spanish, Italian, and Chinese). We show that, by fine-tuning the source model with a small portion of speech from the target datasets, HuMP-CAT achieves an average accuracy of 78.75% across the seven datasets, with notable performance of 88.69% on EMODB (German language) and 79.48% on EMOVO (Italian language). Our extensive evaluation demonstrates that HuMP-CAT outperforms existing methods across multiple target languages.
    Submitted 6 January, 2025; originally announced January 2025.
11. arXiv:2501.07111 [pdf, other] (cs.CL, cs.IR)
    ListConRanker: A Contrastive Text Reranker with Listwise Encoding
    Authors: Junlong Liu, Yue Ma, Ruihui Zhao, Junhao Zheng, Qianli Ma, Yangyang Kang
    Abstract: Reranker models aim to re-rank the passages based on the semantic similarity between the given query and passages, which have recently received more attention due to the wide application of Retrieval-Augmented Generation. Most previous methods apply pointwise encoding, meaning that it can only encode the context of the query for each passage input into the model. However, for the reranker model, given a query, the comparison results between passages are even more important, which is called listwise encoding. Besides, previous models are trained using the cross-entropy loss function, which leads to issues of unsmooth gradient changes during training and low training efficiency. To address these issues, we propose a novel Listwise-encoded Contrastive text reRanker (ListConRanker). It can help the passage to be compared with other passages during the encoding process, and enhance the contrastive information between positive examples and between positive and negative examples. At the same time, we use the circle loss to train the model to increase the flexibility of gradients and solve the problem of training efficiency. Experimental results show that ListConRanker achieves state-of-the-art performance on the reranking benchmark of Chinese Massive Text Embedding Benchmark, including the cMedQA1.0, cMedQA2.0, MMarcoReranking, and T2Reranking datasets.
    Submitted 13 January, 2025; originally announced January 2025.
    Comments: 11 pages, 4 figures
Most previous methods apply pointwise encoding, meaning that it can only encode the context of the query for each passage input into the model. However, for the reranker model, given a query, the comparison results between passages are even more important, which is called listwise encoding. Besides, previous models are trained using the cross-entropy loss function, which leads to issues of unsmooth gradient changes during training and low training efficiency. To address these issues, we propose a novel Listwise-encoded Contrastive text reRanker (ListConRanker). It can help the passage to be compared with other passages during the encoding process, and enhance the contrastive information between positive examples and between positive and negative examples. At the same time, we use the circle loss to train the model to increase the flexibility of gradients and solve the problem of training efficiency. Experimental results show that ListConRanker achieves state-of-the-art performance on the reranking benchmark of Chinese Massive Text Embedding Benchmark, including the cMedQA1.0, cMedQA2.0, MMarcoReranking, and T2Reranking datasets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.07111v1-abstract-full').style.display = 'none'; document.getElementById('2501.07111v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, 4 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.03495">arXiv:2501.03495</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.03495">pdf</a>, <a href="https://arxiv.org/format/2501.03495">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Textualize Visual Prompt for Image Editing via Diffusion Bridge </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xu%2C+P">Pengcheng Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Fan%2C+Q">Qingnan Fan</a>, <a href="/search/cs?searchtype=author&amp;query=Kou%2C+F">Fei Kou</a>, <a href="/search/cs?searchtype=author&amp;query=Qin%2C+S">Shuai Qin</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+H">Hong Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+R">Ruoyu Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Ling%2C+C">Charles Ling</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+B">Boyu Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.03495v2-abstract-short" style="display: inline;"> Visual prompt, a pair of before-and-after edited images, can convey indescribable imagery transformations and prosper in image editing. 
However, current visual prompt methods rely on a pretrained text-guided image-to-image generative model that requires a triplet of text, before, and after images for retraining over a text-to-image model. Such triplet crafting and retraining processes limit the s&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.03495v2-abstract-full').style.display = 'inline'; document.getElementById('2501.03495v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.03495v2-abstract-full" style="display: none;"> Visual prompt, a pair of before-and-after edited images, can convey indescribable imagery transformations and prosper in image editing. However, current visual prompt methods rely on a pretrained text-guided image-to-image generative model that requires a triplet of text, before, and after images for retraining over a text-to-image model. Such triplet crafting and retraining processes limit the scalability and generalization of editing. In this paper, we present a framework based on any single text-to-image model without reliance on the explicit image-to-image model, thus enhancing the generalizability and scalability. Specifically, by leveraging the probability-flow ordinary differential equation, we construct a diffusion bridge to transfer the distribution between before-and-after images under text guidance. By optimizing the text via the bridge, the framework adaptively textualizes the editing transformation conveyed by visual prompts into text embeddings without other models. Meanwhile, we introduce differential attention control during text optimization, which disentangles the text embedding from the invariance of the before-and-after images and makes it solely capture the delicate transformation and generalize to edit various images. Experiments on real images validate competitive results on the generalization, contextual coherence, and high fidelity for delicate editing with just one image pair as the visual prompt. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.03495v2-abstract-full').style.display = 'none'; document.getElementById('2501.03495v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">AAAI 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.00895">arXiv:2501.00895</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.00895">pdf</a>, <a href="https://arxiv.org/format/2501.00895">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Text2Earth: Unlocking Text-driven Remote Sensing Image Generation with a Global-Scale Dataset and a Foundation Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+C">Chenyang Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+K">Keyan Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+R">Rui Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Zou%2C+Z">Zhengxia Zou</a>, <a href="/search/cs?searchtype=author&amp;query=Shi%2C+Z">Zhenwei Shi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.00895v1-abstract-short" style="display: inline;"> Generative foundation models have advanced large-scale text-driven natural image generation, becoming a prominent research trend across various vertical domains. However, in the remote sensing field, there is still a lack of research on large-scale text-to-image (text2image) generation technology. Existing remote sensing image-text datasets are small in scale and confined to specific geographic ar&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.00895v1-abstract-full').style.display = 'inline'; document.getElementById('2501.00895v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.00895v1-abstract-full" style="display: none;"> Generative foundation models have advanced large-scale text-driven natural image generation, becoming a prominent research trend across various vertical domains. However, in the remote sensing field, there is still a lack of research on large-scale text-to-image (text2image) generation technology. Existing remote sensing image-text datasets are small in scale and confined to specific geographic areas and scene types. Besides, existing text2image methods have struggled to achieve global-scale, multi-resolution controllable, and unbounded image generation. To address these challenges, this paper presents two key contributions: the Git-10M dataset and the Text2Earth foundation model. Git-10M is a global-scale image-text dataset comprising 10 million image-text pairs, 5 times larger than the previous largest one. The dataset covers a wide range of geographic scenes and contains resolution information, significantly surpassing existing datasets in both size and diversity. Building on Git-10M, we propose Text2Earth, a 1.3 billion parameter generative foundation model based on the diffusion framework to model global-scale remote sensing scenes. Text2Earth integrates a resolution guidance mechanism, enabling users to specify image resolutions. 
A dynamic condition adaptation strategy is proposed for training and inference to improve image quality. Text2Earth excels in zero-shot text2image generation and demonstrates robust generalization and flexibility across multiple tasks, including unbounded scene construction, image editing, and cross-modal image generation. This robust capability surpasses previous models restricted to the basic fixed size and limited scene types. On the previous benchmark dataset, Text2Earth outperforms previous models with an improvement of +26.23 FID and +20.95% Zero-shot Cls-OA metric.Our project page is \url{https://chen-yang-liu.github.io/Text2Earth} <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.00895v1-abstract-full').style.display = 'none'; document.getElementById('2501.00895v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.18287">arXiv:2412.18287</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.18287">pdf</a>, <a href="https://arxiv.org/format/2412.18287">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1609/aaai.v37i12.26702">10.1609/aaai.v37i12.26702 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Semi-supervised Credit Card Fraud Detection via Attribute-Driven Graph Representation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xiang%2C+S">Sheng Xiang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+M">Mingzhi Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+D">Dawei Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+E">Enxia Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+R">Ruihui Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Ouyang%2C+Y">Yi Ouyang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+L">Ling Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+Y">Yefeng Zheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.18287v1-abstract-short" style="display: inline;"> Credit card fraud incurs a considerable cost for both cardholders and issuing banks. Contemporary methods apply machine learning-based classifiers to detect fraudulent behavior from labeled transaction records. 
But labeled data are usually a small proportion of billions of real transactions due to expensive labeling costs, which implies that they do not well exploit many natural features from unla&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.18287v1-abstract-full').style.display = 'inline'; document.getElementById('2412.18287v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.18287v1-abstract-full" style="display: none;"> Credit card fraud incurs a considerable cost for both cardholders and issuing banks. Contemporary methods apply machine learning-based classifiers to detect fraudulent behavior from labeled transaction records. But labeled data are usually a small proportion of billions of real transactions due to expensive labeling costs, which implies that they do not well exploit many natural features from unlabeled data. Therefore, we propose a semi-supervised graph neural network for fraud detection. Specifically, we leverage transaction records to construct a temporal transaction graph, which is composed of temporal transactions (nodes) and interactions (edges) among them. Then we pass messages among the nodes through a Gated Temporal Attention Network (GTAN) to learn the transaction representation. We further model the fraud patterns through risk propagation among transactions. The extensive experiments are conducted on a real-world transaction dataset and two publicly available fraud detection datasets. The result shows that our proposed method, namely GTAN, outperforms other state-of-the-art baselines on three fraud detection datasets. Semi-supervised experiments demonstrate the excellent fraud detection performance of our model with only a tiny proportion of labeled data. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.18287v1-abstract-full').style.display = 'none'; document.getElementById('2412.18287v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages, 5 figures, AAAI 2023, code: https://github.com/AI4Risk/antifraud</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Proceedings of the AAAI Conference on Artificial Intelligence. Vol. 37. No. 12. 
2023 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.16744">arXiv:2412.16744</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.16744">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> </div> </div> <p class="title is-5 mathjax"> Business Analysis: User Attitude Evaluation and Prediction Based on Hotel User Reviews and Text Mining </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+R">Ruochun Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Hao%2C+Y">Yue Hao</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xuechen Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.16744v1-abstract-short" style="display: inline;"> In the post-pandemic era, the hotel industry plays a crucial role in economic recovery, with consumer sentiment increasingly influencing market trends. This study utilizes advanced natural language processing (NLP) and the BERT model to analyze user reviews, extracting insights into customer satisfaction and guiding service improvements. By transforming reviews into feature vectors, the BERT model&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.16744v1-abstract-full').style.display = 'inline'; document.getElementById('2412.16744v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.16744v1-abstract-full" style="display: none;"> In the post-pandemic era, the hotel industry plays a crucial role in economic recovery, with consumer sentiment increasingly influencing market trends. This study utilizes advanced natural language processing (NLP) and the BERT model to analyze user reviews, extracting insights into customer satisfaction and guiding service improvements. By transforming reviews into feature vectors, the BERT model accurately classifies emotions, uncovering patterns of satisfaction and dissatisfaction. This approach provides valuable data for hotel management, helping them refine service offerings and improve customer experiences. From a financial perspective, understanding sentiment is vital for predicting market performance, as shifts in consumer sentiment often correlate with stock prices and overall industry performance. Additionally, the study addresses data imbalance in sentiment analysis, employing techniques like oversampling and undersampling to enhance model robustness. The results offer actionable insights not only for the hotel industry but also for financial analysts, aiding in market forecasts and investment decisions. This research highlights the potential of sentiment analysis to drive business growth, improve financial outcomes, and enhance competitive advantage in the dynamic tourism and hospitality sectors, thereby contributing to the broader economic landscape. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.16744v1-abstract-full').style.display = 'none'; document.getElementById('2412.16744v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.16264">arXiv:2412.16264</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.16264">pdf</a>, <a href="https://arxiv.org/format/2412.16264">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Continual Learning with Strategic Selection and Forgetting for Network Intrusion Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+X">Xinchen Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+R">Running Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+Z">Zhihan Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+H">Handi Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Ding%2C+Y">Yulong Ding</a>, <a href="/search/cs?searchtype=author&amp;query=Ngai%2C+E+C+H">Edith C. H. Ngai</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+S">Shuang-Hua Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.16264v3-abstract-short" style="display: inline;"> Intrusion Detection Systems (IDS) are crucial for safeguarding digital infrastructure. In dynamic network environments, both threat landscapes and normal operational behaviors are constantly changing, resulting in concept drift. While continuous learning mitigates the adverse effects of concept drift, insufficient attention to drift patterns and excessive preservation of outdated knowledge can sti&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.16264v3-abstract-full').style.display = 'inline'; document.getElementById('2412.16264v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.16264v3-abstract-full" style="display: none;"> Intrusion Detection Systems (IDS) are crucial for safeguarding digital infrastructure. In dynamic network environments, both threat landscapes and normal operational behaviors are constantly changing, resulting in concept drift. While continuous learning mitigates the adverse effects of concept drift, insufficient attention to drift patterns and excessive preservation of outdated knowledge can still hinder the IDS&#39;s adaptability. In this paper, we propose SSF (Strategic Selection and Forgetting), a novel continual learning method for IDS, providing continuous model updates with a constantly refreshed memory buffer. 
Our approach features a strategic sample selection algorithm to select representative new samples and a strategic forgetting mechanism to drop outdated samples. The proposed strategic sample selection algorithm prioritizes new samples that cause the `drifted&#39; pattern, enabling the model to better understand the evolving landscape. Additionally, we introduce strategic forgetting upon detecting significant drift by discarding outdated samples to free up memory, allowing the incorporation of more recent data. SSF captures evolving patterns effectively and ensures the model is aligned with the change of data patterns, significantly enhancing the IDS&#39;s adaptability to concept drift. The state-of-the-art performance of SSF on NSL-KDD and UNSW-NB15 datasets demonstrates its superior adaptability to concept drift for network intrusion detection. The code is released at https://github.com/xinchen930/SSF-Strategic-Selection-and-Forgetting. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.16264v3-abstract-full').style.display = 'none'; document.getElementById('2412.16264v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by IEEE International Conference on Computer Communications (INFOCOM) 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.15228">arXiv:2412.15228</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.15228">pdf</a>, <a href="https://arxiv.org/format/2412.15228">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Image Privacy Protection: A Survey </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wen%2C+W">Wenying Wen</a>, <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+Z">Ziye Yuan</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yushu Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+T">Tao Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+X">Xiangli Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+R">Ruoyu Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Fang%2C+Y">Yuming Fang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.15228v1-abstract-short" style="display: inline;"> Images serve as a crucial medium for communication, presenting information in a visually engaging format that facilitates rapid comprehension of key points. Meanwhile, during transmission and storage, they contain significant sensitive information. 
If not managed properly, this information may be vulnerable to exploitation for personal gain, potentially infringing on privacy rights and other legal&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.15228v1-abstract-full').style.display = 'inline'; document.getElementById('2412.15228v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.15228v1-abstract-full" style="display: none;"> Images serve as a crucial medium for communication, presenting information in a visually engaging format that facilitates rapid comprehension of key points. Meanwhile, during transmission and storage, they contain significant sensitive information. If not managed properly, this information may be vulnerable to exploitation for personal gain, potentially infringing on privacy rights and other legal entitlements. Consequently, researchers continue to propose some approaches for preserving image privacy and publish reviews that provide comprehensive and methodical summaries of these approaches. However, existing reviews tend to categorize either by specific scenarios, or by specific privacy objectives. This classification somewhat restricts the reader&#39;s ability to grasp a holistic view of image privacy protection and poses challenges in developing a total understanding of the subject that transcends different scenarios and privacy objectives. Instead of examining image privacy protection from a single aspect, it is more desirable to consider user needs for a comprehensive understanding. To fill this gap, we conduct a systematic review of image privacy protection approaches based on privacy protection goals. Specifically, we define the attribute known as privacy sensitive domains and use it as the core classification dimension to construct a comprehensive framework for image privacy protection that encompasses various scenarios and privacy objectives. This framework offers a deep understanding of the multi-layered aspects of image privacy, categorizing its protection into three primary levels: data-level, content-level, and feature-level. For each category, we analyze the main approaches and features of image privacy protection and systematically review representative solutions. Finally, we discuss the challenges and future directions of image privacy protection. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.15228v1-abstract-full').style.display = 'none'; document.getElementById('2412.15228v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.12919">arXiv:2412.12919</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.12919">pdf</a>, <a href="https://arxiv.org/format/2412.12919">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> 4DRGS: 4D Radiative Gaussian Splatting for Efficient 3D Vessel Reconstruction from Sparse-View Dynamic DSA Images </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Zhentao Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zha%2C+R">Ruyi Zha</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+H">Huangxuan Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+H">Hongdong Li</a>, <a href="/search/cs?searchtype=author&amp;query=Cui%2C+Z">Zhiming Cui</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.12919v1-abstract-short" style="display: inline;"> Reconstructing 3D vessel structures from sparse-view dynamic digital subtraction angiography (DSA) images enables accurate medical assessment while reducing radiation exposure. Existing methods often produce suboptimal results or require excessive computation time. In this work, we propose 4D radiative Gaussian splatting (4DRGS) to achieve high-quality reconstruction efficiently. In detail, we rep&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.12919v1-abstract-full').style.display = 'inline'; document.getElementById('2412.12919v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.12919v1-abstract-full" style="display: none;"> Reconstructing 3D vessel structures from sparse-view dynamic digital subtraction angiography (DSA) images enables accurate medical assessment while reducing radiation exposure. Existing methods often produce suboptimal results or require excessive computation time. In this work, we propose 4D radiative Gaussian splatting (4DRGS) to achieve high-quality reconstruction efficiently. In detail, we represent the vessels with 4D radiative Gaussian kernels. Each kernel has time-invariant geometry parameters, including position, rotation, and scale, to model static vessel structures. The time-dependent central attenuation of each kernel is predicted from a compact neural network to capture the temporal varying response of contrast agent flow. We splat these Gaussian kernels to synthesize DSA images via X-ray rasterization and optimize the model with real captured ones. The final 3D vessel volume is voxelized from the well-trained kernels. Moreover, we introduce accumulated attenuation pruning and bounded scaling activation to improve reconstruction quality. Extensive experiments on real-world patient data demonstrate that 4DRGS achieves impressive results in 5 minutes training, which is 32x faster than the state-of-the-art method. This underscores the potential of 4DRGS for real-world clinics. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.12919v1-abstract-full').style.display = 'none'; document.getElementById('2412.12919v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Zhentao Liu and Ruyi Zha made equal contributions</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.11483">arXiv:2412.11483</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.11483">pdf</a>, <a href="https://arxiv.org/format/2412.11483">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> &#34;They&#39;ve Stolen My GPL-Licensed Model!&#34;: Toward Standardized and Transparent Model Licensing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Duan%2C+M">Moming Duan</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+R">Rui Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+L">Linshan Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Shadbolt%2C+N">Nigel Shadbolt</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+B">Bingsheng He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.11483v1-abstract-short" style="display: inline;"> As model parameter sizes reach the billion-level range and their training consumes zettaFLOPs of computation, components reuse and collaborative development are become increasingly prevalent in the Machine Learning (ML) community. These components, including models, software, and datasets, may originate from various sources and be published under different licenses, which govern the use and distri&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.11483v1-abstract-full').style.display = 'inline'; document.getElementById('2412.11483v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.11483v1-abstract-full" style="display: none;"> As model parameter sizes reach the billion-level range and their training consumes zettaFLOPs of computation, components reuse and collaborative development are become increasingly prevalent in the Machine Learning (ML) community. These components, including models, software, and datasets, may originate from various sources and be published under different licenses, which govern the use and distribution of licensed works and their derivatives. However, commonly chosen licenses, such as GPL and Apache, are software-specific and are not clearly defined or bounded in the context of model publishing. 
Meanwhile, the reused components may also have free-content licenses and model licenses, which pose a potential risk of license noncompliance and rights infringement within the model production workflow. In this paper, we propose addressing the above challenges along two lines: 1) For license analysis, we have developed a new vocabulary for ML workflow management and encoded license rules to enable ontological reasoning for analyzing rights granting and compliance issues. 2) For standardized model publishing, we have drafted a set of model licenses that provide flexible options to meet the diverse needs of model publishing. Our analysis tool is built on Turtle language and Notation3 reasoning engine, envisioned as a first step toward Linked Open Model Production Data. We have also encoded our proposed model licenses into rules and demonstrated the effects of GPL and other commonly used licenses in model publishing, along with the flexibility advantages of our licenses, through comparisons and experiments. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.11483v1-abstract-full').style.display = 'none'; document.getElementById('2412.11483v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 6 figures. Under review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.10834">arXiv:2412.10834</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.10834">pdf</a>, <a href="https://arxiv.org/format/2412.10834">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> SegACIL: Solving the Stability-Plasticity Dilemma in Class-Incremental Semantic Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jiaxu Li</a>, <a href="/search/cs?searchtype=author&amp;query=Lai%2C+S">Songning Lai</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+R">Rui Li</a>, <a href="/search/cs?searchtype=author&amp;query=Fang%2C+D">Di Fang</a>, <a href="/search/cs?searchtype=author&amp;query=Fan%2C+K">Kejia Fan</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+J">Jianheng Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+Y">Yuhan Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+R">Rongchang Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+D">Dongzhan Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Yue%2C+Y">Yutao Yue</a>, <a href="/search/cs?searchtype=author&amp;query=Zhuang%2C+H">Huiping Zhuang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.10834v1-abstract-short" style="display: inline;"> While deep learning has made remarkable progress in recent years, models continue to struggle with catastrophic forgetting when processing continuously 
incoming data. This issue is particularly critical in continual learning, where the balance between retaining prior knowledge and adapting to new information, known as the stability-plasticity dilemma, remains a significant challenge. In this paper,&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.10834v1-abstract-full').style.display = 'inline'; document.getElementById('2412.10834v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.10834v1-abstract-full" style="display: none;"> While deep learning has made remarkable progress in recent years, models continue to struggle with catastrophic forgetting when processing continuously incoming data. This issue is particularly critical in continual learning, where the balance between retaining prior knowledge and adapting to new information, known as the stability-plasticity dilemma, remains a significant challenge. In this paper, we propose SegACIL, a novel continual learning method for semantic segmentation based on a linear closed-form solution. Unlike traditional methods that require multiple epochs for training, SegACIL only requires a single epoch, significantly reducing computational costs. Furthermore, we provide a theoretical analysis demonstrating that SegACIL achieves performance on par with joint learning, effectively retaining knowledge from previous data, which enables it to maintain both stability and plasticity at the same time. Extensive experiments on the Pascal VOC2012 dataset show that SegACIL achieves superior performance in the sequential, disjoint, and overlap settings, offering a robust solution to the challenges of class-incremental semantic segmentation. Code is available at https://github.com/qwrawq/SegACIL. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.10834v1-abstract-full').style.display = 'none'; document.getElementById('2412.10834v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.10040">arXiv:2412.10040</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.10040">pdf</a>, <a href="https://arxiv.org/format/2412.10040">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> RemDet: Rethinking Efficient Model Design for UAV Object Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Chen Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+R">Rui Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zeyu Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+H">Huiying Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+X">Xinzhong Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.10040v2-abstract-short" style="display: inline;"> Object detection in Unmanned Aerial Vehicle (UAV) images has emerged as a focal area of research, which presents two significant challenges: i) objects are typically small and dense within vast images; ii) computational resource constraints render most models unsuitable for real-time deployment. Current real-time object detectors are not optimized for UAV images, and complex methods designed for s&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.10040v2-abstract-full').style.display = 'inline'; document.getElementById('2412.10040v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.10040v2-abstract-full" style="display: none;"> Object detection in Unmanned Aerial Vehicle (UAV) images has emerged as a focal area of research, which presents two significant challenges: i) objects are typically small and dense within vast images; ii) computational resource constraints render most models unsuitable for real-time deployment. Current real-time object detectors are not optimized for UAV images, and complex methods designed for small object detection often lack real-time capabilities. To address these challenges, we propose a novel detector, RemDet (Reparameter efficient multiplication Detector). Our contributions are as follows: 1) Rethinking the challenges of existing detectors for small and dense UAV images, and proposing information loss as a design guideline for efficient models. 2) We introduce the ChannelC2f module to enhance small object detection performance, demonstrating that high-dimensional representations can effectively mitigate information loss. 3) We design the GatedFFN module to provide not only strong performance but also low latency, effectively addressing the challenges of real-time detection. Our research reveals that GatedFFN, through the use of multiplication, is more cost-effective than feed-forward networks for high-dimensional representation. 4) We propose the CED module, which combines the advantages of ViT and CNN downsampling to effectively reduce information loss. It specifically enhances context information for small and dense objects. 
Extensive experiments on large UAV datasets, Visdrone and UAVDT, validate the real-time efficiency and superior performance of our methods. On the challenging UAV dataset VisDrone, our methods not only provided state-of-the-art results, improving detection by more than 3.4%, but also achieve 110 FPS on a single 4090. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.10040v2-abstract-full').style.display = 'none'; document.getElementById('2412.10040v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to AAAI25</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.06936">arXiv:2412.06936</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.06936">pdf</a>, <a href="https://arxiv.org/format/2412.06936">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Creating a Cooperative AI Policymaking Platform through Open Source Collaboration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lewington%2C+A">Aiden Lewington</a>, <a href="/search/cs?searchtype=author&amp;query=Vittalam%2C+A">Alekhya Vittalam</a>, <a href="/search/cs?searchtype=author&amp;query=Singh%2C+A">Anshumaan Singh</a>, <a href="/search/cs?searchtype=author&amp;query=Uppuluri%2C+A">Anuja Uppuluri</a>, <a href="/search/cs?searchtype=author&amp;query=Ashok%2C+A">Arjun Ashok</a>, <a href="/search/cs?searchtype=author&amp;query=Athmaram%2C+A+M">Ashrith Mandayam Athmaram</a>, <a href="/search/cs?searchtype=author&amp;query=Milt%2C+A">Austin Milt</a>, <a href="/search/cs?searchtype=author&amp;query=Smith%2C+B">Benjamin Smith</a>, <a href="/search/cs?searchtype=author&amp;query=Weinberger%2C+C">Charlie Weinberger</a>, <a href="/search/cs?searchtype=author&amp;query=Sarin%2C+C">Chatanya Sarin</a>, <a href="/search/cs?searchtype=author&amp;query=Bergmeir%2C+C">Christoph Bergmeir</a>, <a href="/search/cs?searchtype=author&amp;query=Chang%2C+C">Cliff Chang</a>, <a href="/search/cs?searchtype=author&amp;query=Patel%2C+D">Daivik Patel</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+D">Daniel Li</a>, <a href="/search/cs?searchtype=author&amp;query=Bell%2C+D">David Bell</a>, <a href="/search/cs?searchtype=author&amp;query=Cao%2C+D">Defu Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Shin%2C+D">Donghwa Shin</a>, <a href="/search/cs?searchtype=author&amp;query=Kang%2C+E">Edward Kang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+E">Edwin Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+E">Enhui Li</a>, <a 
href="/search/cs?searchtype=author&amp;query=Chen%2C+F">Felix Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Smithline%2C+G">Gabe Smithline</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+H">Haipeng Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Gasztowtt%2C+H">Henry Gasztowtt</a>, <a href="/search/cs?searchtype=author&amp;query=Shin%2C+H">Hoon Shin</a> , et al. (26 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.06936v1-abstract-short" style="display: inline;"> Advances in artificial intelligence (AI) present significant risks and opportunities, requiring improved governance to mitigate societal harms and promote equitable benefits. Current incentive structures and regulatory delays may hinder responsible AI development and deployment, particularly in light of the transformative potential of large language models (LLMs). To address these challenges, we p&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.06936v1-abstract-full').style.display = 'inline'; document.getElementById('2412.06936v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.06936v1-abstract-full" style="display: none;"> Advances in artificial intelligence (AI) present significant risks and opportunities, requiring improved governance to mitigate societal harms and promote equitable benefits. Current incentive structures and regulatory delays may hinder responsible AI development and deployment, particularly in light of the transformative potential of large language models (LLMs). To address these challenges, we propose developing the following three contributions: (1) a large multimodal text and economic-timeseries foundation model that integrates economic and natural language policy data for enhanced forecasting and decision-making, (2) algorithmic mechanisms for eliciting diverse and representative perspectives, enabling the creation of data-driven public policy recommendations, and (3) an AI-driven web platform for supporting transparent, inclusive, and data-driven policymaking. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.06936v1-abstract-full').style.display = 'none'; document.getElementById('2412.06936v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.06599">arXiv:2412.06599</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.06599">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Medical Physics">physics.med-ph</span> </div> </div> <p class="title is-5 mathjax"> A No-Reference Medical Image Quality Assessment Method Based on Automated Distortion Recognition Technology: Application to Preprocessing in MRI-guided Radiotherapy </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zilin Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+S">Shengqi Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Dai%2C+J">Jianrong Dai</a>, <a href="/search/cs?searchtype=author&amp;query=Qin%2C+S">Shirui Qin</a>, <a href="/search/cs?searchtype=author&amp;query=Cao%2C+Y">Ying Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+R">Ruiao Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+G">Guohua Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+Y">Yuan Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+J">Jiayun Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.06599v2-abstract-short" style="display: inline;"> Objective:To develop a no-reference image quality assessment method using automated distortion recognition to boost MRI-guided radiotherapy precision.Methods:We analyzed 106,000 MR images from 10 patients with liver metastasis,captured with the Elekta Unity MR-LINAC.Our No-Reference Quality Assessment Model includes:1)image preprocessing to enhance visibility of key diagnostic features;2)feature e&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.06599v2-abstract-full').style.display = 'inline'; document.getElementById('2412.06599v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.06599v2-abstract-full" style="display: none;"> Objective:To develop a no-reference image quality assessment method using automated distortion recognition to boost MRI-guided radiotherapy precision.Methods:We analyzed 106,000 MR images from 10 patients with liver metastasis,captured with the Elekta Unity MR-LINAC.Our No-Reference Quality Assessment Model includes:1)image preprocessing to enhance visibility of key diagnostic features;2)feature extraction and directional analysis using MSCN coefficients across four directions to capture textural attributes and gradients,vital for identifying image features and potential distortions;3)integrative Quality Index(QI)calculation,which integrates features via AGGD parameter estimation and K-means clustering.The QI,based on a weighted MAD computation of directional scores,provides a comprehensive image quality measure,robust against outliers.LOO-CV assessed model generalizability and performance.Tumor tracking algorithm performance was compared with and without preprocessing 
to verify tracking accuracy enhancements. Results: Preprocessing significantly improved image quality, with the QI showing substantial positive changes and surpassing other metrics. After normalization, the QI&#39;s average value was 79.6 times higher than CNR, indicating improved image definition and contrast. It also showed higher sensitivity in detail recognition with average values 6.5 times and 1.7 times higher than Tenengrad gradient and entropy. The tumor tracking algorithm confirmed significant tracking accuracy improvements with preprocessed images, validating preprocessing effectiveness. Conclusions: This study introduces a novel no-reference image quality evaluation method based on automated distortion recognition, offering a new quality control tool for MRIgRT tumor tracking. It enhances clinical application accuracy and facilitates medical image quality assessment standardization, with significant clinical and research value. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.06599v2-abstract-full').style.display = 'none'; document.getElementById('2412.06599v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.06163">arXiv:2412.06163</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.06163">pdf</a>, <a href="https://arxiv.org/format/2412.06163">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ASGDiffusion: Parallel High-Resolution Generation with Asynchronous Structure Guidance </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yuming Li</a>, <a href="/search/cs?searchtype=author&amp;query=Jia%2C+P">Peidong Jia</a>, <a href="/search/cs?searchtype=author&amp;query=Hong%2C+D">Daiwei Hong</a>, <a href="/search/cs?searchtype=author&amp;query=Jia%2C+Y">Yueru Jia</a>, <a href="/search/cs?searchtype=author&amp;query=She%2C+Q">Qi She</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+R">Rui Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+M">Ming Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+S">Shanghang Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.06163v1-abstract-short" style="display: inline;"> Training-free high-resolution (HR) image generation has garnered significant attention due to the high costs of training large diffusion models. Most existing methods begin by reconstructing the overall structure and then proceed to refine the local details. Despite their advancements, they still face issues with repetitive patterns in HR image generation.
Besides, HR generation with diffusion mod&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.06163v1-abstract-full').style.display = 'inline'; document.getElementById('2412.06163v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.06163v1-abstract-full" style="display: none;"> Training-free high-resolution (HR) image generation has garnered significant attention due to the high costs of training large diffusion models. Most existing methods begin by reconstructing the overall structure and then proceed to refine the local details. Despite their advancements, they still face issues with repetitive patterns in HR image generation. Besides, HR generation with diffusion models incurs significant computational costs. Thus, parallel generation is essential for interactive applications. To solve the above limitations, we introduce a novel method named ASGDiffusion for parallel HR generation with Asynchronous Structure Guidance (ASG) using pre-trained diffusion models. To solve the pattern repetition problem of HR image generation, ASGDiffusion leverages the low-resolution (LR) noise weighted by the attention mask as the structure guidance for the denoising step to ensure semantic consistency. The proposed structure guidance can significantly alleviate the pattern repetition problem. To enable parallel generation, we further propose a parallelism strategy, which calculates the patch noises and structure guidance asynchronously. By leveraging multi-GPU parallel acceleration, we significantly accelerate generation speed and reduce memory usage per GPU. Extensive experiments demonstrate that our method effectively and efficiently addresses common issues like pattern repetition and achieves state-of-the-art HR generation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.06163v1-abstract-full').style.display = 'none'; document.getElementById('2412.06163v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.04266">arXiv:2412.04266</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.04266">pdf</a>, <a href="https://arxiv.org/format/2412.04266">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Representation Purification for End-to-End Speech Translation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+C">Chengwei Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+Y">Yue Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+R">Rui Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yidong Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Shi%2C+X">Xiaodong Shi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.04266v1-abstract-short" style="display: inline;"> Speech-to-text translation (ST) is a cross-modal task that involves converting spoken language into text in a different language. Previous research primarily focused on enhancing speech translation by facilitating knowledge transfer from machine translation, exploring various methods to bridge the gap between speech and text modalities. Despite substantial progress made, factors in speech that are&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.04266v1-abstract-full').style.display = 'inline'; document.getElementById('2412.04266v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.04266v1-abstract-full" style="display: none;"> Speech-to-text translation (ST) is a cross-modal task that involves converting spoken language into text in a different language. Previous research primarily focused on enhancing speech translation by facilitating knowledge transfer from machine translation, exploring various methods to bridge the gap between speech and text modalities. Despite substantial progress made, factors in speech that are not relevant to translation content, such as timbre and rhythm, often limit the efficiency of knowledge transfer. In this paper, we conceptualize speech representation as a combination of content-agnostic and content-relevant factors. We examine the impact of content-agnostic factors on translation performance through preliminary experiments and observe a significant performance deterioration when content-agnostic perturbations are introduced to speech signals. To address this issue, we propose a \textbf{S}peech \textbf{R}epresentation \textbf{P}urification with \textbf{S}upervision \textbf{E}nhancement (SRPSE) framework, which excludes the content-agnostic components within speech representations to mitigate their negative impact on ST. 
Experiments on MuST-C and CoVoST-2 datasets demonstrate that SRPSE significantly improves translation performance across all translation directions in three settings and achieves preeminent performance under a \textit{transcript-free} setting.
Submitted 5 December, 2024; originally announced December 2024.
Comments: Accepted by COLING 2025

arXiv:2412.03131 (https://arxiv.org/abs/2412.03131) [pdf, other]
Subjects: cs.LG (Machine Learning); cs.DC (Distributed, Parallel, and Cluster Computing)
Title: Unifying KV Cache Compression for Large Language Models with LeanKV
Authors: Yanqi Zhang, Yuwei Hu, Runyuan Zhao, John C. S. Lui, Haibo Chen
Abstract: Large language models (LLMs) demonstrate exceptional performance but incur high serving costs due to substantial memory demands, with the key-value (KV) cache being a primary bottleneck. Existing KV cache compression methods, including quantization and pruning, struggle with limitations such as uniform treatment of keys and values and static memory allocation across attention heads.
To address these challenges, we introduce LeanKV, a unified KV cache compression framework that enhances LLM serving efficiency without compromising accuracy through three innovations: (1) Hetero-KV quantization, which stores keys at a higher precision than values to reflect their greater impact on attention computations; (2) per-head dynamic sparsity, which allocates memory based on token importance per head and per request; and (3) unified KV compression, integrating mixed-precision quantization and selective pruning to enable a smooth tradeoff between model accuracy and memory efficiency. To efficiently support these techniques, LeanKV introduces systems optimizations including unified paging and on-GPU parallel memory management. Implemented on vLLM, LeanKV compresses the KV cache by $3.0\times$ to $5.0\times$ without accuracy loss and up to $11.0\times$ with under 5% accuracy loss, enhancing throughput by $1.9\times$ to $2.5\times$, and up to $6.9\times$.
Submitted 4 December, 2024; originally announced December 2024.
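As a loose illustration of the first two ideas named above (keys stored at higher precision than values, and a per-head token budget driven by importance), here is a small self-contained Python sketch. The function names, the 8-bit/4-bit choice, and the importance proxy are assumptions for exposition, not LeanKV's actual implementation.

```python
import numpy as np

def quantize(x, bits):
    """Uniform symmetric quantization of a float array to `bits` bits (sketch)."""
    qmax = 2 ** (bits - 1) - 1
    scale = np.abs(x).max() / qmax if np.abs(x).max() > 0 else 1.0
    q = np.clip(np.round(x / scale), -qmax - 1, qmax).astype(np.int8)
    return q, scale

def hetero_kv_compress(keys, values, attn_scores, keep_ratio=0.5):
    """Keys kept at higher precision than values, with a per-head token budget.

    keys, values: (num_heads, seq_len, head_dim) float arrays
    attn_scores:  (num_heads, seq_len) importance proxy per token and head
    """
    num_heads, seq_len, _ = keys.shape
    compressed = []
    for h in range(num_heads):
        budget = max(1, int(keep_ratio * seq_len))
        keep = np.argsort(attn_scores[h])[-budget:]        # most important tokens
        k_q, k_scale = quantize(keys[h, keep], bits=8)     # keys: higher precision
        v_q, v_scale = quantize(values[h, keep], bits=4)   # values: lower precision
        compressed.append((keep, k_q, k_scale, v_q, v_scale))
    return compressed

rng = np.random.default_rng(0)
out = hetero_kv_compress(rng.normal(size=(2, 16, 8)),
                         rng.normal(size=(2, 16, 8)),
                         rng.random((2, 16)))
print(len(out), out[0][1].shape)
```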
arXiv:2412.01114 (https://arxiv.org/abs/2412.01114) [pdf, other]
Subjects: cs.LG (Machine Learning)
Title: Dense Dynamics-Aware Reward Synthesis: Integrating Prior Experience with Demonstrations
Authors: Cevahir Koprulu, Po-han Li, Tianyu Qiu, Ruihan Zhao, Tyler Westenbroek, David Fridovich-Keil, Sandeep Chinchali, Ufuk Topcu
Abstract: Many continuous control problems can be formulated as sparse-reward reinforcement learning (RL) tasks. In principle, online RL methods can automatically explore the state space to solve each new task. However, discovering sequences of actions that lead to a non-zero reward becomes exponentially more difficult as the task horizon increases. Manually shaping rewards can accelerate learning for a fixed task, but it is an arduous process that must be repeated for each new environment. We introduce a systematic reward-shaping framework that distills the information contained in 1) a task-agnostic prior data set and 2) a small number of task-specific expert demonstrations, and then uses these priors to synthesize dense dynamics-aware rewards for the given task. This supervision substantially accelerates learning in our experiments, and we provide analysis demonstrating how the approach can effectively guide online learning agents to faraway goals.
Submitted 1 December, 2024; originally announced December 2024.

arXiv:2411.17102 (https://arxiv.org/abs/2411.17102) [pdf, other]
Subjects: cs.IR (Information Retrieval)
Title: Scholar Name Disambiguation with Search-enhanced LLM Across Language
Authors: Renyu Zhao, Yunxin Chen
Abstract: The task of scholar name disambiguation is crucial in various real-world scenarios, including bibliometric-based candidate evaluation for awards, application material anti-fraud measures, and more. Despite significant advancements, current methods face limitations due to the complexity of heterogeneous data, often necessitating extensive human intervention. This paper proposes a novel approach by leveraging search-enhanced language models across multiple languages to improve name disambiguation. By utilizing the powerful query rewriting, intent recognition, and data indexing capabilities of search engines, our method can gather richer information for distinguishing between entities and extracting profiles, resulting in a more comprehensive data dimension. Given the strong cross-language capabilities of large language models (LLMs), optimizing enhanced retrieval methods with this technology offers substantial potential for high-efficiency information retrieval and utilization. Our experiments demonstrate that incorporating local languages significantly enhances disambiguation performance, particularly for scholars from diverse geographic regions. This multi-lingual, search-enhanced methodology offers a promising direction for more efficient and accurate active scholar name disambiguation.
Submitted 25 November, 2024; originally announced November 2024.
arXiv:2411.13932 (https://arxiv.org/abs/2411.13932) [pdf]
Subjects: cs.AI (Artificial Intelligence); cs.MA (Multiagent Systems)
Title: XAgents: A Framework for Interpretable Rule-Based Multi-Agents Cooperation
Authors: Hailong Yang, Mingxian Gu, Renhuo Zhao, Fuping Hu, Zhaohong Deng, Yitang Chen
Abstract: Extracting implicit knowledge and logical reasoning abilities from large language models (LLMs) has consistently been a significant challenge. The advancement of multi-agent systems has further enhanced the capabilities of LLMs. Inspired by the structure of multi-polar neurons (MNs), we propose the XAgents framework, an interpretable multi-agent cooperative framework based on the IF-THEN rule-based system. The IF-Parts of the rules are responsible for logical reasoning and domain membership calculation, while the THEN-Parts are comprised of domain expert agents that generate domain-specific contents. Following the calculation of the membership, XAgents transmits the task to the disparate domain rules, which subsequently generate the various responses. These responses are analogous to the answers provided by different experts to the same question. The final response is reached by eliminating the hallucinations and erroneous knowledge of the LLM through membership computation and semantic adversarial generation of the various domain rules. The incorporation of rule-based interpretability serves to bolster user confidence in the XAgents framework. We evaluate the efficacy of XAgents through a comparative analysis with the latest AutoAgents, in which XAgents demonstrated superior performance across three distinct datasets.
We perform post-hoc interpretability studies with the SHAP algorithm and case studies, proving the interpretability of XAgents in terms of input-output feature correlation and rule-based semantics.
Submitted 21 November, 2024; originally announced November 2024.
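A minimal Python sketch of the IF-THEN structure described above, assuming stand-in membership functions and expert callables in place of real LLM agents; the aggregation step is simplified and does not reproduce the paper's semantic adversarial generation.

```python
from dataclasses import dataclass
from typing import Callable, List

@dataclass
class Rule:
    """One IF-THEN rule: the IF-part scores domain membership, the THEN-part answers."""
    name: str
    membership: Callable[[str], float]   # IF-part: degree the query belongs to this domain
    expert: Callable[[str], str]         # THEN-part: domain expert agent (e.g. an LLM call)

def route_and_answer(query: str, rules: List[Rule], threshold: float = 0.2) -> str:
    # Compute membership of the query in each domain rule.
    scored = [(rule, rule.membership(query)) for rule in rules]
    active = [(r, m) for r, m in scored if m >= threshold]
    if not active:
        return "No domain rule matched."
    # Each active domain expert produces a response; rank them by membership.
    responses = sorted(((m, r.expert(query)) for r, m in active), reverse=True)
    # Simplified aggregation: return the highest-membership answer and note which
    # domains were consulted (a stand-in for the paper's adversarial aggregation).
    consulted = ", ".join(r.name for r, _ in active)
    return f"[domains: {consulted}] {responses[0][1]}"

rules = [
    Rule("math", lambda q: 1.0 if "integral" in q else 0.0, lambda q: "Answer from math expert."),
    Rule("medicine", lambda q: 1.0 if "dose" in q else 0.0, lambda q: "Answer from medical expert."),
]
print(route_and_answer("What is the integral of x^2?", rules))
```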
arXiv:2411.12195 (https://arxiv.org/abs/2411.12195) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: A Survey of Medical Vision-and-Language Applications and Their Techniques
Authors: Qi Chen, Ruoshan Zhao, Sinuo Wang, Vu Minh Hieu Phan, Anton van den Hengel, Johan Verjans, Zhibin Liao, Minh-Son To, Yong Xia, Jian Chen, Yutong Xie, Qi Wu
Abstract: Medical vision-and-language models (MVLMs) have attracted substantial interest due to their capability to offer a natural language interface for interpreting complex medical data. Their applications are versatile and have the potential to improve diagnostic accuracy and decision-making for individual patients while also contributing to enhanced public health monitoring, disease surveillance, and policy-making through more efficient analysis of large data sets. MVLMs integrate natural language processing with medical images to enable a more comprehensive and contextual understanding of medical images alongside their corresponding textual information. Unlike general vision-and-language models trained on diverse, non-specialized datasets, MVLMs are purpose-built for the medical domain, automatically extracting and interpreting critical information from medical images and textual reports to support clinical decision-making. Popular clinical applications of MVLMs include automated medical report generation, medical visual question answering, medical multimodal segmentation, diagnosis and prognosis, and medical image-text retrieval. Here, we provide a comprehensive overview of MVLMs and the various medical tasks to which they have been applied. We conduct a detailed analysis of various vision-and-language model architectures, focusing on their distinct strategies for cross-modal integration/exploitation of medical visual and textual features. We also examine the datasets used for these tasks and compare the performance of different models based on standardized evaluation metrics. Furthermore, we highlight potential challenges and summarize future research trends and directions. The full collection of papers and codes is available at: https://github.com/YtongXie/Medical-Vision-and-Language-Tasks-and-Methodologies-A-Survey.
Submitted 18 November, 2024; originally announced November 2024.
arXiv:2411.11020 (https://arxiv.org/abs/2411.11020) [pdf, other]
Subjects: cs.LG (Machine Learning); cs.SI (Social and Information Networks)
Title: Training a Label-Noise-Resistant GNN with Reduced Complexity
Authors: Rui Zhao, Bin Shi, Zhiming Liang, Jianfei Ruan, Bo Dong, Lu Lin
Abstract: Graph Neural Networks (GNNs) have been widely employed for semi-supervised node classification tasks on graphs. However, the performance of GNNs is significantly affected by label noise, that is, a small amount of incorrectly labeled nodes can substantially misguide model training. Mainstream solutions define node classification with label noise (NCLN) as a reliable labeling task, often introducing node similarity with quadratic computational complexity to more accurately assess label reliability. To this end, in this paper, we introduce the Label Ensemble Graph Neural Network (LEGNN), a lower-complexity method for robust GNN training against label noise. LEGNN reframes NCLN as a label ensemble task, gathering informative multiple labels instead of constructing a single reliable label, avoiding high-complexity computations for reliability assessment. Specifically, LEGNN conducts a two-step process: bootstrapping neighboring contexts and robust learning with gathered multiple labels. In the former step, we apply random neighbor masks for each node and gather the predicted labels as a high-probability label set. This mitigates the impact of inaccurately labeled neighbors and diversifies the label set. In the latter step, we utilize a partial label learning based strategy to aggregate the high-probability label information for model training.
Additionally, we symmetrically gather a low-probability label set to counteract potential noise from the bootstrapped high-probability label set. Extensive experiments on six datasets demonstrate that LEGNN achieves outstanding performance while ensuring efficiency. Moreover, it exhibits good scalability on datasets with over one hundred thousand nodes and one million edges.
Submitted 17 November, 2024; originally announced November 2024.
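The bootstrapping step described above can be pictured with a toy sketch: repeatedly mask a node's neighbours at random, collect the resulting label predictions, and split them into high- and low-probability sets. The voting function below is a stand-in for a real GNN forward pass and is not the authors' code.

```python
import numpy as np

def masked_neighbor_vote(adj, labels, node, rng, keep_prob=0.7):
    """Predict a label for `node` by majority vote over a random subset of its
    labelled neighbours (a toy stand-in for a GNN forward pass under a mask)."""
    neighbours = [v for v in adj[node] if rng.random() < keep_prob]
    votes = [labels[v] for v in neighbours if labels[v] >= 0]
    if not votes:
        return labels[node]
    return np.bincount(votes).argmax()

def gather_label_sets(adj, noisy_labels, node, num_masks=10, hi_thresh=0.6, seed=0):
    """Bootstrap several random neighbour masks, then split the predicted labels
    into a high-probability set (for training) and a low-probability set
    (used symmetrically to counteract noise), as sketched from the abstract."""
    rng = np.random.default_rng(seed)
    preds = [masked_neighbor_vote(adj, noisy_labels, node, rng) for _ in range(num_masks)]
    counts = np.bincount(preds, minlength=noisy_labels.max() + 1) / num_masks
    high = set(np.where(counts >= hi_thresh)[0])
    low = set(np.where((counts > 0) & (counts < hi_thresh))[0])
    return high, low

adj = {0: [1, 2, 3], 1: [0, 2], 2: [0, 1, 3], 3: [0, 2]}
noisy_labels = np.array([1, 1, 0, 1])   # node 2 may be mislabelled
print(gather_label_sets(adj, noisy_labels, node=2))
```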
arXiv:2411.08896 (https://arxiv.org/abs/2411.08896) [pdf, other]
Subjects: eess.SP (Signal Processing); cs.LG (Machine Learning); cs.NI (Networking and Internet Architecture)
Title: Demand-Aware Beam Hopping and Power Allocation for Load Balancing in Digital Twin empowered LEO Satellite Networks
Authors: Ruili Zhao, Jun Cai, Jiangtao Luo, Junpeng Gao, Yongyi Ran
Abstract: Low-Earth orbit (LEO) satellites utilizing beam hopping (BH) technology offer extensive coverage, low latency, high bandwidth, and significant flexibility. However, the uneven geographical distribution and temporal variability of ground traffic demands, combined with the high mobility of LEO satellites, present significant challenges for efficient beam resource utilization. Traditional BH methods based on GEO satellites fail to address issues such as satellite interference, overlapping coverage, and mobility. This paper explores a Digital Twin (DT)-based collaborative resource allocation network for multiple LEO satellites with overlapping coverage areas. A two-tier optimization problem, focusing on load balancing and cell service fairness, is proposed to maximize throughput and minimize inter-cell service delay. The DT layer optimizes the allocation of overlapping coverage cells by designing BH patterns for each satellite, while the LEO layer optimizes power allocation for each selected service cell. At the DT layer, an Actor-Critic network is deployed on each agent, with a global critic network in the cloud center. The A3C algorithm is employed to optimize the DT layer. Concurrently, the LEO layer optimization is performed using a Multi-Agent Reinforcement Learning algorithm, where each beam functions as an independent agent. The simulation results show that this method reduces satellite load disparity by about 72.5% and decreases the average delay to 12 ms. Additionally, our approach outperforms other benchmarks in terms of throughput, ensuring a better alignment between offered and requested data.
Submitted 28 October, 2024; originally announced November 2024.

arXiv:2411.07503 (https://arxiv.org/abs/2411.07503) [pdf]
Subjects: eess.IV (Image and Video Processing); cs.CV (Computer Vision and Pattern Recognition); cs.LG (Machine Learning); physics.med-ph (Medical Physics); q-bio.TO (Tissues and Organs)
Title: A Novel Automatic Real-time Motion Tracking Method for Magnetic Resonance Imaging-guided Radiotherapy: Leveraging the Enhanced Tracking-Learning-Detection Framework with Automatic Segmentation
Authors: Shengqi Chen, Zilin Wang, Jianrong Dai, Shirui Qin, Ying Cao, Ruiao Zhao, Jiayun Chen, Guohua Wu, Yuan Tang
Abstract: Background and Purpose: Accurate motion tracking in MRI-guided Radiotherapy (MRIgRT) is essential for effective treatment delivery. This study aimed to enhance motion tracking precision in MRIgRT through an automatic real-time markerless tracking method using an enhanced Tracking-Learning-Detection (ETLD) framework with automatic segmentation. Materials and Methods: We developed a novel MRIgRT motion tracking and segmentation method by integrating the ETLD framework with an improved Chan-Vese model (ICV), named ETLD+ICV. The ETLD framework was upgraded for real-time cine MRI, including advanced image preprocessing, no-reference image quality assessment, an enhanced median-flow tracker, and a refined detector with dynamic search region adjustments. ICV was used for precise target volume coverage, refining the segmented region frame by frame using tracking results, with key parameters optimized. The method was tested on 3.5D MRI scans from 10 patients with liver metastases. Results: Evaluation of 106,000 frames across 77 treatment fractions showed sub-millimeter tracking errors of less than 0.8 mm, with over 99% precision and 98% recall for all subjects in the Beam Eye View (BEV)/Beam Path View (BPV) orientation. The ETLD+ICV method achieved a Dice global score of more than 82% for all subjects, demonstrating the method's extensibility and precise target volume coverage. Conclusion: This study successfully developed an automatic real-time markerless motion tracking method for MRIgRT that significantly outperforms current methods. The novel method not only delivers exceptional precision in tracking and segmentation but also shows enhanced adaptability to clinical demands, making it an indispensable asset in improving the efficacy of radiotherapy treatments.
Submitted 6 January, 2025; v1 submitted 11 November, 2024; originally announced November 2024.
arXiv:2411.07446 (https://arxiv.org/abs/2411.07446) [pdf]
Subjects: cs.CL (Computation and Language)
Title: Efficient and Accurate Prompt Optimization: the Benefit of Memory in Exemplar-Guided Reflection
Authors: Cilin Yan, Jingyun Wang, Lin Zhang, Ruihui Zhao, Xiaopu Wu, Kai Xiong, Qingsong Liu, Guoliang Kang, Yangyang Kang
Abstract: Automatic prompt engineering aims to enhance the generation quality of large language models (LLMs). Recent works utilize feedback generated from erroneous cases to guide the prompt optimization. During inference, they may further retrieve several semantically related exemplars and concatenate them to the optimized prompts to improve the performance. However, those works only utilize the feedback at the current step, ignoring historical and unselected feedback which is potentially beneficial. Moreover, the selection of exemplars only considers the general semantic relationship and may not be optimal in terms of task performance and matching with the optimized prompt. In this work, we propose an Exemplar-Guided Reflection with Memory mechanism (ERM) to realize more efficient and accurate prompt optimization. Specifically, we design an exemplar-guided reflection mechanism where the feedback generation is additionally guided by the generated exemplars. We further build two kinds of memory to fully utilize the historical feedback information and support more effective exemplar retrieval.
Empirical evaluations show our method surpasses previous state-of-the-art methods with fewer optimization steps, i.e., improving the F1 score by 10.1 on the LIAR dataset and reducing the optimization steps by half compared with ProTeGi.
Submitted 11 November, 2024; originally announced November 2024.
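One possible reading of the memory component is sketched below: feedback entries are stored together with the exemplar and the score gain they produced, and retrieval blends lexical similarity with that historical gain. All names and the scoring rule are illustrative assumptions, not the ERM implementation.

```python
import math
from dataclasses import dataclass, field
from typing import List

@dataclass
class FeedbackMemory:
    """Tiny sketch of a feedback/exemplar memory for prompt optimization."""
    entries: List[dict] = field(default_factory=list)

    def add(self, feedback: str, exemplar: str, score_gain: float):
        # Store each piece of feedback with the exemplar that produced it and
        # the score improvement observed when it was applied.
        self.entries.append({"feedback": feedback, "exemplar": exemplar, "gain": score_gain})

    def retrieve(self, query: str, k: int = 2):
        # Rank by a blend of crude lexical similarity and historical gain, so the
        # retrieval is not purely semantic (one of the points in the abstract).
        def similarity(text: str) -> float:
            a, b = set(query.lower().split()), set(text.lower().split())
            return len(a & b) / math.sqrt(len(a) * len(b) + 1e-9)
        ranked = sorted(self.entries,
                        key=lambda e: 0.5 * similarity(e["exemplar"]) + 0.5 * e["gain"],
                        reverse=True)
        return ranked[:k]

mem = FeedbackMemory()
mem.add("Label sarcasm explicitly.", "Claim: 'Sure, taxes are fun.' -> half-true", 0.10)
mem.add("Check numeric claims.", "Claim: 'GDP doubled in a year.' -> false", 0.04)
print(mem.retrieve("Claim about taxes being fun", k=1))
```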
arXiv:2411.05005 (https://arxiv.org/abs/2411.05005) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.LG (Machine Learning); cs.RO (Robotics)
Title: Diff-2-in-1: Bridging Generation and Dense Perception with Diffusion Models
Authors: Shuhong Zheng, Zhipeng Bao, Ruoyu Zhao, Martial Hebert, Yu-Xiong Wang
Abstract: Beyond high-fidelity image synthesis, diffusion models have recently exhibited promising results in dense visual perception tasks. However, most existing work treats diffusion models as a standalone component for perception tasks, employing them either solely for off-the-shelf data augmentation or as mere feature extractors. In contrast to these isolated and thus sub-optimal efforts, we introduce a unified, versatile, diffusion-based framework, Diff-2-in-1, that can simultaneously handle both multi-modal data generation and dense visual perception, through a unique exploitation of the diffusion-denoising process. Within this framework, we further enhance discriminative visual perception via multi-modal generation, by utilizing the denoising network to create multi-modal data that mirror the distribution of the original training set. Importantly, Diff-2-in-1 optimizes the utilization of the created diverse and faithful data by leveraging a novel self-improving learning mechanism. Comprehensive experimental evaluations validate the effectiveness of our framework, showcasing consistent performance improvements across various discriminative backbones and high-quality multi-modal data generation characterized by both realism and usefulness.
Submitted 7 November, 2024; originally announced November 2024.
Comments: 26 pages, 14 figures

arXiv:2411.04715 (https://arxiv.org/abs/2411.04715) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); q-bio.QM (Quantitative Methods)
Title: NeuroFly: A framework for whole-brain single neuron reconstruction
Authors: Rubin Zhao, Yang Liu, Shiqi Zhang, Zijian Yi, Yanyang Xiao, Fang Xu, Yi Yang, Pencheng Zhou
Abstract: Neurons, with their elongated, tree-like dendritic and axonal structures, enable efficient signal integration and long-range communication across brain regions. By reconstructing individual neurons' morphology, we can gain valuable insights into brain connectivity, revealing the structural basis of cognition, movement, and perception.
Despite the accumulation of extensive 3D microscopic imaging data, progress has been considerably hindered by the absence of automated tools to streamline this process. Here we introduce NeuroFly, a validated framework for large-scale automatic single neuron reconstruction. This framework breaks down the process into three distinct stages: segmentation, connection, and proofreading. In the segmentation stage, we perform automatic segmentation followed by skeletonization to generate over-segmented neuronal fragments without branches. During the connection stage, we use a 3D image-based path following approach to extend each fragment and connect it with other fragments of the same neuron. Finally, human annotators are required only to proofread the few unresolved positions. The first two stages of our process are clearly defined computer vision problems, and we have trained robust baseline models to solve them. We validated NeuroFly's efficiency using in-house datasets that include a variety of challenging scenarios, such as dense arborizations, weak axons, and images with contamination. We will release the datasets along with a suite of visualization and annotation tools for better reproducibility. Our goal is to foster collaboration among researchers to address the neuron reconstruction challenge, ultimately accelerating advancements in neuroscience research. The dataset and code are available at https://github.com/beanli161514/neurofly
Submitted 7 November, 2024; originally announced November 2024.
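As a toy picture of the connection stage only, the sketch below greedily links over-segmented fragments whose endpoints are close together; the actual NeuroFly connection stage follows image intensity along 3D paths, which is not modelled here.

```python
import numpy as np

def connect_fragments(fragments, radius=5.0):
    """Toy version of the 'connection' stage: link fragments whose endpoints lie
    within `radius` of each other (distance alone stands in for the image-based
    path following used in the paper)."""
    endpoints = [(i, frag[0], frag[-1]) for i, frag in enumerate(fragments)]
    links = []
    for i, (ia, a_start, a_end) in enumerate(endpoints):
        for ib, b_start, b_end in endpoints[i + 1:]:
            gap = min(np.linalg.norm(a_end - b_start), np.linalg.norm(b_end - a_start))
            if gap <= radius:
                links.append((ia, ib, float(gap)))
    return sorted(links, key=lambda x: x[2])

# Three over-segmented fragments as short 3D polylines (arbitrary toy coordinates).
frags = [np.array([[0, 0, 0], [10, 0, 0]], float),
         np.array([[12, 0, 0], [20, 0, 0]], float),
         np.array([[50, 50, 50], [60, 50, 50]], float)]
print(connect_fragments(frags))   # fragments 0 and 1 are linked; fragment 2 stays isolated
```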
arXiv:2411.02299 (https://arxiv.org/abs/2411.02299) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.LG (Machine Learning)
Title: Grouped Discrete Representation for Object-Centric Learning
Authors: Rongzhen Zhao, Vivienne Wang, Juho Kannala, Joni Pajarinen
Abstract: Object-Centric Learning (OCL) can discover objects in images or videos by simply reconstructing the input. For better object discovery, representative OCL methods reconstruct the input as its Variational Autoencoder (VAE) intermediate representation, which suppresses pixel noises and promotes object separability by discretizing continuous super-pixels with template features. However, treating features as units overlooks their composing attributes, thus impeding model generalization; indexing features with scalar numbers loses attribute-level similarities and differences, thus hindering model convergence. We propose \textit{Grouped Discrete Representation} (GDR) for OCL. We decompose features into combinatorial attributes via organized channel grouping, and compose these attributes into discrete representation via tuple indexes. Experiments show that our GDR improves both Transformer- and Diffusion-based OCL methods consistently on various datasets. Visualizations show that our GDR captures better object separability.
Submitted 4 November, 2024; originally announced November 2024.
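The channel-grouping idea above can be illustrated with a few lines of Python: split each feature vector into groups, quantize each group against its own codebook, and keep the resulting tuple of indexes. The codebook sizes and grouping below are arbitrary toy choices, not the GDR training procedure.

```python
import numpy as np

def grouped_discretize(features, codebooks):
    """Sketch of a grouped discrete representation: channels are split into groups,
    each group is quantized against its own small codebook, and a tuple of
    per-group indexes replaces the single scalar code.

    features:  (N, D) continuous feature vectors
    codebooks: list of G arrays, each (K, D // G), one codebook per channel group
    """
    n, d = features.shape
    g = len(codebooks)
    group_dim = d // g
    tuple_indexes = np.zeros((n, g), dtype=np.int64)
    quantized = np.zeros_like(features)
    for gi, cb in enumerate(codebooks):
        chunk = features[:, gi * group_dim:(gi + 1) * group_dim]
        # Nearest codebook entry per group (attribute-level discretization).
        dists = ((chunk[:, None, :] - cb[None, :, :]) ** 2).sum(-1)
        idx = dists.argmin(axis=1)
        tuple_indexes[:, gi] = idx
        quantized[:, gi * group_dim:(gi + 1) * group_dim] = cb[idx]
    return tuple_indexes, quantized

rng = np.random.default_rng(0)
feats = rng.normal(size=(5, 8))
books = [rng.normal(size=(4, 4)) for _ in range(2)]   # 2 groups, 4 codes each
idx, q = grouped_discretize(feats, books)
print(idx)   # each row is a tuple index such as [3, 1]
```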
arXiv:2411.00726 (https://arxiv.org/abs/2411.00726) [pdf, other]
Subjects: eess.IV (Image and Video Processing); cs.AI (Artificial Intelligence); cs.CV (Computer Vision and Pattern Recognition)
Title: Cross-Fundus Transformer for Multi-modal Diabetic Retinopathy Grading with Cataract
Authors: Fan Xiao, Junlin Hou, Ruiwei Zhao, Rui Feng, Haidong Zou, Lina Lu, Yi Xu, Juzhao Zhang
Abstract: Diabetic retinopathy (DR) is a leading cause of blindness worldwide and a common complication of diabetes. As two different imaging tools for DR grading, color fundus photography (CFP) and infrared fundus photography (IFP) are highly correlated and complementary in clinical applications. To the best of our knowledge, this is the first study that explores a novel multi-modal deep learning framework to fuse the information from CFP and IFP towards more accurate DR grading. Specifically, we construct a dual-stream architecture Cross-Fundus Transformer (CFT) to fuse the ViT-based features of two fundus image modalities. In particular, a meticulously engineered Cross-Fundus Attention (CFA) module is introduced to capture the correspondence between CFP and IFP images. Moreover, we adopt both single-modality and multi-modality supervision to maximize the overall performance for DR grading. Extensive experiments on a clinical dataset consisting of 1,713 pairs of multi-modal fundus images demonstrate the superiority of our proposed method.
Our code will be released for public access.
Submitted 1 November, 2024; originally announced November 2024.
Comments: 10 pages, 4 figures
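A generic single-head cross-attention between the two token streams, shown below, gives a rough sense of what a Cross-Fundus Attention style fusion could compute; the real CFA module, its learned projections, and its supervision are not reproduced here.

```python
import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

def cross_fundus_attention(cfp_tokens, ifp_tokens):
    """Generic single-head cross-attention between two ViT-style token streams:
    CFP tokens attend to IFP tokens and vice versa, and the attended features
    are pooled into one joint representation for grading.

    cfp_tokens, ifp_tokens: (N, D) token features for each fundus modality
    """
    d = cfp_tokens.shape[-1]
    attn_c2i = softmax(cfp_tokens @ ifp_tokens.T / np.sqrt(d))   # CFP attends to IFP
    attn_i2c = softmax(ifp_tokens @ cfp_tokens.T / np.sqrt(d))   # IFP attends to CFP
    cfp_fused = cfp_tokens + attn_c2i @ ifp_tokens
    ifp_fused = ifp_tokens + attn_i2c @ cfp_tokens
    return (cfp_fused.mean(axis=0) + ifp_fused.mean(axis=0)) / 2  # pooled joint feature

rng = np.random.default_rng(0)
joint = cross_fundus_attention(rng.normal(size=(16, 32)), rng.normal(size=(16, 32)))
print(joint.shape)   # (32,) feature that a DR grading head could consume
```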
arXiv:2410.22076 [pdf, other]  (https://arxiv.org/abs/2410.22076)
Subjects: cs.SD (Sound); cs.HC (Human-Computer Interaction); eess.AS (Audio and Speech Processing)
USpeech: Ultrasound-Enhanced Speech with Minimal Human Effort via Cross-Modal Synthesis
Authors: Luca Jiang-Tao Yu, Running Zhao, Sijie Ji, Edith C. H. Ngai, Chenshu Wu
Abstract: Speech enhancement is crucial in human-computer interaction, especially for ubiquitous devices. Ultrasound-based speech enhancement has emerged as an attractive choice because of its superior ubiquity and performance. However, inevitable interference from unexpected and unintended sources during audio-ultrasound data acquisition makes existing solutions rely heavily on human effort for data collection and processing. This leads to significant data scarcity that limits the full potential of ultrasound-based speech enhancement. To address this, we propose USpeech, a cross-modal ultrasound synthesis framework for speech enhancement with minimal human effort. At its core is a two-stage framework that establishes correspondence between visual and ultrasonic modalities by leveraging audible audio as a bridge. This approach overcomes challenges from the lack of paired video-ultrasound datasets and the inherent heterogeneity between video and ultrasound data. Our framework incorporates contrastive video-audio pre-training to project modalities into a shared semantic space and employs an audio-ultrasound encoder-decoder for ultrasound synthesis. We then present a speech enhancement network that enhances speech in the time-frequency domain and recovers the clean speech waveform via a neural vocoder. Comprehensive experiments show USpeech achieves remarkable performance using synthetic ultrasound data comparable to physical data, significantly outperforming state-of-the-art ultrasound-based speech enhancement baselines. USpeech is open-sourced at https://github.com/aiot-lab/USpeech/.
Submitted 29 October, 2024; originally announced October 2024.
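The first stage pairs video and audio embeddings contrastively in a shared semantic space. A symmetric InfoNCE objective of the kind such pre-training usually uses might look like the sketch below; this is a generic contrastive loss, not taken from the released USpeech code.

```python
import torch
import torch.nn.functional as F

def symmetric_contrastive_loss(video_emb, audio_emb, temperature=0.07):
    """Symmetric InfoNCE over a batch of paired video/audio clips:
    matching pairs sit on the diagonal of the similarity matrix."""
    v = F.normalize(video_emb, dim=-1)
    a = F.normalize(audio_emb, dim=-1)
    logits = v @ a.t() / temperature
    targets = torch.arange(v.size(0), device=v.device)
    return 0.5 * (F.cross_entropy(logits, targets) +
                  F.cross_entropy(logits.t(), targets))

loss = symmetric_contrastive_loss(torch.randn(8, 256), torch.randn(8, 256))
```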
arXiv:2410.21112 [pdf]  (https://arxiv.org/abs/2410.21112)
Subjects: cs.RO (Robotics); physics.app-ph (Applied Physics)
Magnetic Milli-spinner for Robotic Endovascular Surgery
Authors: Shuai Wu, Sophie Leanza, Lu Lu, Yilong Chang, Qi Li, Diego Stone, Ruike Renee Zhao
Abstract: Vascular diseases such as thrombosis, atherosclerosis, and aneurysm, which can lead to blockage of blood flow or blood vessel rupture, are common and life-threatening. Conventional minimally invasive treatments utilize catheters, or long tubes, to guide small devices or therapeutic agents to targeted regions for intervention. Unfortunately, catheters suffer from difficult and unreliable navigation in narrow, winding vessels such as those found in the brain. Magnetically actuated untethered robots, which have been extensively explored as an alternative, are promising for navigation in complex vasculatures and vascular disease treatments. Most current robots, however, cannot swim against high flows or are inadequate in treating certain conditions. Here, we introduce a multifunctional and magnetically actuated milli-spinner robot for rapid navigation and performance of various treatments in complicated vasculatures. The milli-spinner, with a unique hollow structure including helical fins and slits for propulsion, generates a distinct flow field upon spinning. The milli-spinner is the fastest-ever untethered magnetic robot for movement in tubular environments, easily achieving speeds of 23 cm/s, demonstrating promise as an untethered medical device for effective navigation in blood vessels and robotic treatment of numerous vascular diseases.
Submitted 28 October, 2024; originally announced October 2024.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21112v1-abstract-full').style.display = 'none'; document.getElementById('2410.21112v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.18242">arXiv:2410.18242</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.18242">pdf</a>, <a href="https://arxiv.org/format/2410.18242">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> Human-Agent Coordination in Games under Incomplete Information via Multi-Step Intent </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+S">Shenghui Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+R">Ruihan Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Chinchali%2C+S">Sandeep Chinchali</a>, <a href="/search/cs?searchtype=author&amp;query=Topcu%2C+U">Ufuk Topcu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.18242v2-abstract-short" style="display: inline;"> Strategic coordination between autonomous agents and human partners under incomplete information can be modeled as turn-based cooperative games. We extend a turn-based game under incomplete information, the shared-control game, to allow players to take multiple actions per turn rather than a single action. The extension enables the use of multi-step intent, which we hypothesize will improve perfor&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18242v2-abstract-full').style.display = 'inline'; document.getElementById('2410.18242v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.18242v2-abstract-full" style="display: none;"> Strategic coordination between autonomous agents and human partners under incomplete information can be modeled as turn-based cooperative games. We extend a turn-based game under incomplete information, the shared-control game, to allow players to take multiple actions per turn rather than a single action. The extension enables the use of multi-step intent, which we hypothesize will improve performance in long-horizon tasks. To synthesize cooperative policies for the agent in this extended game, we propose an approach featuring a memory module for a running probabilistic belief of the environment dynamics and an online planning algorithm called IntentMCTS. This algorithm strategically selects the next action by leveraging any communicated multi-step intent via reward augmentation while considering the current belief. Agent-to-agent simulations in the Gnomes at Night testbed demonstrate that IntentMCTS requires fewer steps and control switches than baseline methods. 
arXiv:2410.17268 [pdf, other]  (https://arxiv.org/abs/2410.17268)
Subjects: cs.NE (Neural and Evolutionary Computing); cs.AI (Artificial Intelligence)
SPikE-SSM: A Sparse, Precise, and Efficient Spiking State Space Model for Long Sequences Learning
Authors: Yan Zhong, Ruoyu Zhao, Chao Wang, Qinghai Guo, Jianguo Zhang, Zhichao Lu, Luziwei Leng
Abstract: Spiking neural networks (SNNs) provide an energy-efficient solution by utilizing the spike-based and sparse nature of biological systems. Since the advent of Transformers, SNNs have struggled to compete with artificial networks on long sequential tasks, until the recent emergence of state space models (SSMs), which offer superior computational efficiency and modeling capability. However, applying the highly capable SSMs to SNNs for long sequences learning poses three major challenges: (1) The membrane potential is determined by the past spiking history of the neuron, leading to reduced efficiency for sequence modeling in parallel computing scenarios. (2) Complex dynamics of biological spiking neurons are crucial for functionality but challenging to simulate and exploit effectively in large networks. (3) It is arduous to maintain high sparsity while achieving high accuracy for spiking neurons without resorting to dense computing, as utilized in artificial neuron-based SSMs. To address them, we propose a sparse, precise and efficient spiking SSM framework, termed SPikE-SSM. For (1), we propose a boundary compression strategy (PMBC) to accelerate the inference of the spiking neuron model, enabling parallel processing for long sequence learning. For (2), we propose a novel and concise neuron model incorporating reset-refractory mechanism to leverage the inherent temporal dimension for dynamic computing with biological interpretability. For (3), we hierarchically integrate the proposed neuron model to the original SSM block, and enhance the dynamics of SPikE-SSM by incorporating trainable thresholds and refractory magnitudes to balance accuracy and sparsity. Extensive experiments verify the effectiveness and robustness of SPikE-SSM on the long range arena benchmarks and large language dataset WikiText-103, showing the potential of dynamic spiking neurons in efficient long sequence learning.
Submitted 7 October, 2024; originally announced October 2024.
Comments: 23 pages, 5 figures
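Challenge (1) and the proposed reset-refractory neuron concern sequential spiking dynamics. A bare-bones leaky integrate-and-fire loop with a hard reset and a refractory counter conveys the flavor; the constants are illustrative, and the explicit time loop is exactly the inefficiency the paper's PMBC strategy is designed to avoid.

```python
import torch

def lif_with_refractory(inputs, beta=0.9, threshold=1.0, refractory_steps=2):
    """Sequential leaky integrate-and-fire with hard reset and a refractory
    window; returns the binary spike train. Illustrative only."""
    batch, steps = inputs.shape
    v = torch.zeros(batch)          # membrane potential
    refrac = torch.zeros(batch)     # remaining refractory steps
    spikes = torch.zeros(batch, steps)
    for t in range(steps):
        active = (refrac == 0).float()
        v = beta * v + active * inputs[:, t]
        fired = (v >= threshold).float() * active
        spikes[:, t] = fired
        v = v * (1 - fired)                                   # reset on spike
        refrac = torch.clamp(refrac - 1, min=0) + fired * refractory_steps
    return spikes

print(lif_with_refractory(torch.rand(1, 10)).tolist())
```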
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">23 pages, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.16653">arXiv:2410.16653</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.16653">pdf</a>, <a href="https://arxiv.org/format/2410.16653">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1609/aiide.v20i1.31871">10.1609/aiide.v20i1.31871 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Enhancing Two-Player Performance Through Single-Player Knowledge Transfer: An Empirical Study on Atari 2600 Games </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Saadat%2C+K">Kimiya Saadat</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+R">Richard Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.16653v1-abstract-short" style="display: inline;"> Playing two-player games using reinforcement learning and self-play can be challenging due to the complexity of two-player environments and the possible instability in the training process. We propose that a reinforcement learning algorithm can train more efficiently and achieve improved performance in a two-player game if it leverages the knowledge from the single-player version of the same game.&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16653v1-abstract-full').style.display = 'inline'; document.getElementById('2410.16653v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.16653v1-abstract-full" style="display: none;"> Playing two-player games using reinforcement learning and self-play can be challenging due to the complexity of two-player environments and the possible instability in the training process. We propose that a reinforcement learning algorithm can train more efficiently and achieve improved performance in a two-player game if it leverages the knowledge from the single-player version of the same game. This study examines the proposed idea in ten different Atari 2600 environments using the Atari 2600 RAM as the input state. We discuss the advantages of using transfer learning from a single-player training process over training in a two-player setting from scratch, and demonstrate our results in a few measures such as training time and average total reward. We also discuss a method of calculating RAM complexity and its relationship to performance. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16653v1-abstract-full').style.display = 'none'; document.getElementById('2410.16653v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Proceedings of the Twentieth AAAI Conference on Artificial Intelligence and Interactive Digital Entertainment (AIIDE-24), Lexington, USA, November, 2024 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.15644">arXiv:2410.15644</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.15644">pdf</a>, <a href="https://arxiv.org/format/2410.15644">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1609/aiide.v20i1.31877">10.1609/aiide.v20i1.31877 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Procedural Content Generation in Games: A Survey with Insights on Emerging LLM Integration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Maleki%2C+M+F">Mahdi Farrokhi Maleki</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+R">Richard Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.15644v1-abstract-short" style="display: inline;"> Procedural Content Generation (PCG) is defined as the automatic creation of game content using algorithms. PCG has a long history in both the game industry and the academic world. It can increase player engagement and ease the work of game designers. While recent advances in deep learning approaches in PCG have enabled researchers and practitioners to create more sophisticated content, it is the a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15644v1-abstract-full').style.display = 'inline'; document.getElementById('2410.15644v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.15644v1-abstract-full" style="display: none;"> Procedural Content Generation (PCG) is defined as the automatic creation of game content using algorithms. PCG has a long history in both the game industry and the academic world. It can increase player engagement and ease the work of game designers. While recent advances in deep learning approaches in PCG have enabled researchers and practitioners to create more sophisticated content, it is the arrival of Large Language Models (LLMs) that truly disrupted the trajectory of PCG advancement. 
arXiv:2410.15205 [pdf, other]  (https://arxiv.org/abs/2410.15205)
Subjects: cs.MA (Multiagent Systems)
DTPPO: Dual-Transformer Encoder-based Proximal Policy Optimization for Multi-UAV Navigation in Unseen Complex Environments
Authors: Anning Wei, Jintao Liang, Kaiyuan Lin, Ziyue Li, Rui Zhao
Abstract: Existing multi-agent deep reinforcement learning (MADRL) methods for multi-UAV navigation face challenges in generalization, particularly when applied to unseen complex environments. To address these limitations, we propose a Dual-Transformer Encoder-based Proximal Policy Optimization (DTPPO) method. DTPPO enhances multi-UAV collaboration through a Spatial Transformer, which models inter-agent dynamics, and a Temporal Transformer, which captures temporal dependencies to improve generalization across diverse environments. This architecture allows UAVs to navigate new, unseen environments without retraining. Extensive simulations demonstrate that DTPPO outperforms current MADRL methods in terms of transferability, obstacle avoidance, and navigation efficiency across environments with varying obstacle densities. The results confirm DTPPO's effectiveness as a robust solution for multi-UAV navigation in both known and unseen scenarios.
Submitted 19 October, 2024; originally announced October 2024.
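The dual-encoder design, one transformer attending across agents at each timestep and another across each agent's history, can be approximated as follows before a PPO actor-critic head. Dimensions, layer counts, and the class name are invented for illustration and are not taken from the paper.

```python
import torch
import torch.nn as nn

class DualTransformerEncoder(nn.Module):
    """Spatial attention over UAVs at each timestep, then temporal attention
    over each UAV's own history. Rough sketch of the dual-encoder idea."""
    def __init__(self, obs_dim=32, d_model=64, heads=4):
        super().__init__()
        self.embed = nn.Linear(obs_dim, d_model)
        make_layer = lambda: nn.TransformerEncoderLayer(d_model, heads, batch_first=True)
        self.spatial = nn.TransformerEncoder(make_layer(), num_layers=2)
        self.temporal = nn.TransformerEncoder(make_layer(), num_layers=2)

    def forward(self, obs):                       # obs: (batch, time, agents, obs_dim)
        b, t, n, _ = obs.shape
        x = self.embed(obs)
        x = self.spatial(x.reshape(b * t, n, -1)).reshape(b, t, n, -1)
        x = x.permute(0, 2, 1, 3).reshape(b * n, t, -1)   # per-agent sequences
        x = self.temporal(x).reshape(b, n, t, -1)
        return x                                  # would feed PPO actor/critic heads

feats = DualTransformerEncoder()(torch.randn(2, 16, 4, 32))
```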
arXiv:2410.13861 [pdf, other]  (https://arxiv.org/abs/2410.13861)
Subjects: cs.CV (Computer Vision and Pattern Recognition)
PUMA: Empowering Unified MLLM with Multi-granular Visual Generation
Authors: Rongyao Fang, Chengqi Duan, Kun Wang, Hao Li, Hao Tian, Xingyu Zeng, Rui Zhao, Jifeng Dai, Hongsheng Li, Xihui Liu
Abstract: Recent advancements in multimodal foundation models have yielded significant progress in vision-language understanding. Initial attempts have also explored the potential of multimodal large language models (MLLMs) for visual content generation. However, existing works have insufficiently addressed the varying granularity demands of different image generation tasks within a unified MLLM paradigm - from the diversity required in text-to-image generation to the precise controllability needed in image manipulation. In this work, we propose PUMA, emPowering Unified MLLM with Multi-grAnular visual generation. PUMA unifies multi-granular visual features as both inputs and outputs of MLLMs, elegantly addressing the different granularity requirements of various image generation tasks within a unified MLLM framework. Following multimodal pretraining and task-specific instruction tuning, PUMA demonstrates proficiency in a wide range of multimodal tasks. This work represents a significant step towards a truly unified MLLM capable of adapting to the granularity demands of various visual tasks. The code and model will be released in https://github.com/rongyaofang/PUMA.
Submitted 21 October, 2024; v1 submitted 17 October, 2024; originally announced October 2024.
Comments: Project page: https://rongyaofang.github.io/puma/
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page: https://rongyaofang.github.io/puma/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.13830">arXiv:2410.13830</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.13830">pdf</a>, <a href="https://arxiv.org/format/2410.13830">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> DreamVideo-2: Zero-Shot Subject-Driven Video Customization with Precise Motion Control </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wei%2C+Y">Yujie Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+S">Shiwei Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+H">Hangjie Yuan</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xiang Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Qiu%2C+H">Haonan Qiu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+R">Rui Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Feng%2C+Y">Yutong Feng</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+F">Feng Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+Z">Zhizhong Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+J">Jiaxin Ye</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yingya Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Shan%2C+H">Hongming Shan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.13830v1-abstract-short" style="display: inline;"> Recent advances in customized video generation have enabled users to create videos tailored to both specific subjects and motion trajectories. However, existing methods often require complicated test-time fine-tuning and struggle with balancing subject learning and motion control, limiting their real-world applications. In this paper, we present DreamVideo-2, a zero-shot video customization framew&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13830v1-abstract-full').style.display = 'inline'; document.getElementById('2410.13830v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.13830v1-abstract-full" style="display: none;"> Recent advances in customized video generation have enabled users to create videos tailored to both specific subjects and motion trajectories. However, existing methods often require complicated test-time fine-tuning and struggle with balancing subject learning and motion control, limiting their real-world applications. In this paper, we present DreamVideo-2, a zero-shot video customization framework capable of generating videos with a specific subject and motion trajectory, guided by a single image and a bounding box sequence, respectively, and without the need for test-time fine-tuning. 
arXiv:2410.13185 [pdf, other]  (https://arxiv.org/abs/2410.13185)
Subjects: cs.AI (Artificial Intelligence); cs.CL (Computation and Language)
Chain of Ideas: Revolutionizing Research Via Novel Idea Development with LLM Agents
Authors: Long Li, Weiwen Xu, Jiayan Guo, Ruochen Zhao, Xingxuan Li, Yuqian Yuan, Boqiang Zhang, Yuming Jiang, Yifei Xin, Ronghao Dang, Deli Zhao, Yu Rong, Tian Feng, Lidong Bing
Abstract: Effective research ideation is a critical step for scientific research. However, the exponential increase in scientific literature makes it challenging for researchers to stay current with recent advances and identify meaningful research directions. Recent developments in large language models (LLMs) suggest a promising avenue for automating the generation of novel research ideas. However, existing methods for idea generation either trivially prompt LLMs or directly expose LLMs to extensive literature without indicating useful information. Inspired by the research process of human researchers, we propose a Chain-of-Ideas (CoI) agent, an LLM-based agent that organizes relevant literature in a chain structure to effectively mirror the progressive development in a research domain. This organization facilitates LLMs to capture the current advancements in research, thereby enhancing their ideation capabilities. Furthermore, we propose Idea Arena, an evaluation protocol that can comprehensively evaluate idea generation methods from different perspectives, aligning closely with the preferences of human researchers. Experimental results indicate that the CoI agent consistently outperforms other methods and shows comparable quality as humans in research idea generation. Moreover, our CoI agent is budget-friendly, with a minimum cost of $0.50 to generate a candidate idea and its corresponding experimental design.
Submitted 30 October, 2024; v1 submitted 16 October, 2024; originally announced October 2024.
Comments: 10 pages, 5 figures, conference
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages,5 figures, conference</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.07133">arXiv:2410.07133</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.07133">pdf</a>, <a href="https://arxiv.org/format/2410.07133">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> EvolveDirector: Approaching Advanced Text-to-Image Generation with Large Vision-Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+R">Rui Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+H">Hangjie Yuan</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+Y">Yujie Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+S">Shiwei Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+Y">Yuchao Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Ran%2C+L">Lingmin Ran</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xiang Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Z">Zhangjie Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+J">Junhao Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yingya Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Shou%2C+M+Z">Mike Zheng Shou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.07133v2-abstract-short" style="display: inline;"> Recent advancements in generation models have showcased remarkable capabilities in generating fantastic content. However, most of them are trained on proprietary high-quality data, and some models withhold their parameters and only provide accessible application programming interfaces (APIs), limiting their benefits for downstream tasks. To explore the feasibility of training a text-to-image gener&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.07133v2-abstract-full').style.display = 'inline'; document.getElementById('2410.07133v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.07133v2-abstract-full" style="display: none;"> Recent advancements in generation models have showcased remarkable capabilities in generating fantastic content. However, most of them are trained on proprietary high-quality data, and some models withhold their parameters and only provide accessible application programming interfaces (APIs), limiting their benefits for downstream tasks. To explore the feasibility of training a text-to-image generation model comparable to advanced models using publicly available resources, we introduce EvolveDirector. This framework interacts with advanced models through their public APIs to obtain text-image data pairs to train a base model. Our experiments with extensive data indicate that the model trained on generated data of the advanced model can approximate its generation capability. However, it requires large-scale samples of 10 million or more. 
This incurs significant expenses in time, computational resources, and especially the costs associated with calling fee-based APIs. To address this problem, we leverage pre-trained large vision-language models (VLMs) to guide the evolution of the base model. VLM continuously evaluates the base model during training and dynamically updates and refines the training dataset by the discrimination, expansion, deletion, and mutation operations. Experimental results show that this paradigm significantly reduces the required data volume. Furthermore, when approaching multiple advanced models, EvolveDirector can select the best samples generated by them to learn powerful and balanced abilities. The final trained model Edgen is demonstrated to outperform these advanced models. The code and model weights are available at https://github.com/showlab/EvolveDirector. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.07133v2-abstract-full').style.display = 'none'; document.getElementById('2410.07133v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Zhao%2C+R&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Zhao%2C+R&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Zhao%2C+R&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Zhao%2C+R&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Zhao%2C+R&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Zhao%2C+R&amp;start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">&hellip;</span></li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 
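The dynamic dataset refinement described here, where a VLM scores generated pairs and the training set is updated through discrimination, expansion, deletion, and mutation, can be caricatured as a selection loop. Every callable and field name below is a stand-in for illustration; the actual implementation is in the repository linked in the abstract.

```python
def refine_training_set(dataset, vlm_score, generate_variants, keep_threshold=0.5):
    """One curation round: drop low-scoring text-image pairs (discrimination /
    deletion) and add variants of the best ones (expansion / mutation).
    All callables here are stand-ins for illustration."""
    scored = [(pair, vlm_score(pair)) for pair in dataset]
    kept = [pair for pair, score in scored if score >= keep_threshold]
    best = [pair for pair, score in sorted(scored, key=lambda x: -x[1])[:10]]
    return kept + [variant for pair in best for variant in generate_variants(pair)]

# usage with dummy stand-ins:
new_set = refine_training_set(
    dataset=[{"prompt": "a cat", "image": "img_0.png"}],
    vlm_score=lambda pair: 0.9,
    generate_variants=lambda pair: [dict(pair, prompt=pair["prompt"] + ", detailed")])
```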
