Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/>  <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b">  <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\$","\$"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a>  <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div>  <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div>  <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 1,352 results for author: <span class="mathjax">Shi, Y</span> </h1> </div> <div class="level-right is-hidden-mobile">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Shi%2C+Y">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Shi, Y"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Shi%2C+Y&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Shi, Y"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Shi%2C+Y&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Shi%2C+Y&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Shi%2C+Y&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Shi%2C+Y&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Shi%2C+Y&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Shi%2C+Y&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14025">arXiv:2411.14025</a> <span> [<a href="https://arxiv.org/pdf/2411.14025">pdf</a>, <a href="https://arxiv.org/format/2411.14025">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> RISecure-PUF: Multipurpose PUF-Driven Security Extensions with Lookaside Buffer in RISC-V </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+C">Chenghao Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiaolin Zhang</a>, <a href="/search/cs?searchtype=author&query=Qin%2C+K">Kailun Qin</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+T">Tengfei Wang</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yipeng Shi</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+T">Tianyi Huang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chi Zhang</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+D">Dawu Gu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14025v1-abstract-short" style="display: inline;"> RISC-V's limited security features hinder its use in confidential computing and heterogeneous platforms. This paper introduces RISecure-PUF, a security extension utilizing existing Physical Unclonable Functions for key generation and secure protocol purposes. A one-way hash function is integrated to ensure provable security against modeling attacks, while a lookaside buffer accelerates batch sampl… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14025v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14025v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14025v1-abstract-full" style="display: none;"> RISC-V's limited security features hinder its use in confidential computing and heterogeneous platforms. This paper introduces RISecure-PUF, a security extension utilizing existing Physical Unclonable Functions for key generation and secure protocol purposes. A one-way hash function is integrated to ensure provable security against modeling attacks, while a lookaside buffer accelerates batch sampling and minimizes reliance on error correction codes. Implemented on the Genesys 2 FPGA, RISecure-PUF improves at least $2.72\times$ in batch scenarios with negligible hardware overhead and a maximum performance reduction of $10.7\%$, enabled by reusing the hash function module in integrated environments such as cryptographic engines. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14025v1-abstract-full').style.display = 'none'; document.getElementById('2411.14025v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13766">arXiv:2411.13766</a> <span> [<a href="https://arxiv.org/pdf/2411.13766">pdf</a>, <a href="https://arxiv.org/format/2411.13766">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Tiny-Align: Bridging Automatic Speech Recognition and Large Language Model on the Edge </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Qin%2C+R">Ruiyang Qin</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+D">Dancheng Liu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+G">Gelei Xu</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+Z">Zheyu Yan</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+C">Chenhui Xu</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Y">Yuting Hu</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+X+S">X. Sharon Hu</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+J">Jinjun Xiong</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yiyu Shi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13766v1-abstract-short" style="display: inline;"> The combination of Large Language Models (LLM) and Automatic Speech Recognition (ASR), when deployed on edge devices (called edge ASR-LLM), can serve as a powerful personalized assistant to enable audio-based interaction for users. Compared to text-based interaction, edge ASR-LLM allows accessible and natural audio interactions. Unfortunately, existing ASR-LLM models are mainly trained in high-per… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13766v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13766v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13766v1-abstract-full" style="display: none;"> The combination of Large Language Models (LLM) and Automatic Speech Recognition (ASR), when deployed on edge devices (called edge ASR-LLM), can serve as a powerful personalized assistant to enable audio-based interaction for users. Compared to text-based interaction, edge ASR-LLM allows accessible and natural audio interactions. Unfortunately, existing ASR-LLM models are mainly trained in high-performance computing environments and produce substantial model weights, making them difficult to deploy on edge devices. More importantly, to better serve users' personalized needs, the ASR-LLM must be able to learn from each distinct user, given that audio input often contains highly personalized characteristics that necessitate personalized on-device training. Since individually fine-tuning the ASR or LLM often leads to suboptimal results due to modality-specific limitations, end-to-end training ensures seamless integration of audio features and language understanding (cross-modal alignment), ultimately enabling a more personalized and efficient adaptation on edge devices. However, due to the complex training requirements and substantial computational demands of existing approaches, cross-modal alignment between ASR audio and LLM can be challenging on edge devices. In this work, we propose a resource-efficient cross-modal alignment framework that bridges ASR and LLMs on edge devices to handle personalized audio input. Our framework enables efficient ASR-LLM alignment on resource-constrained devices like NVIDIA Jetson Orin (8GB RAM), achieving 50x training time speedup while improving the alignment quality by more than 50\%. To the best of our knowledge, this is the first work to study efficient ASR-LLM alignment on resource-constrained edge devices. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13766v1-abstract-full').style.display = 'none'; document.getElementById('2411.13766v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7 pages, 8 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13560">arXiv:2411.13560</a> <span> [<a href="https://arxiv.org/pdf/2411.13560">pdf</a>, <a href="https://arxiv.org/format/2411.13560">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Emerging Technologies">cs.ET</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> AMSnet-KG: A Netlist Dataset for LLM-based AMS Circuit Auto-Design Using Knowledge Graph RAG </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yichen Shi</a>, <a href="/search/cs?searchtype=author&query=Tao%2C+Z">Zhuofu Tao</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yuhao Gao</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+T">Tianjia Zhou</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+C">Cheng Chang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yaxing Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+B">Bingyu Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+G">Genhao Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+A">Alvin Liu</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+Z">Zhiping Yu</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+T">Ting-Jung Lin</a>, <a href="/search/cs?searchtype=author&query=He%2C+L">Lei He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13560v1-abstract-short" style="display: inline;"> High-performance analog and mixed-signal (AMS) circuits are mainly full-custom designed, which is time-consuming and labor-intensive. A significant portion of the effort is experience-driven, which makes the automation of AMS circuit design a formidable challenge. Large language models (LLMs) have emerged as powerful tools for Electronic Design Automation (EDA) applications, fostering advancements… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13560v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13560v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13560v1-abstract-full" style="display: none;"> High-performance analog and mixed-signal (AMS) circuits are mainly full-custom designed, which is time-consuming and labor-intensive. A significant portion of the effort is experience-driven, which makes the automation of AMS circuit design a formidable challenge. Large language models (LLMs) have emerged as powerful tools for Electronic Design Automation (EDA) applications, fostering advancements in the automatic design process for large-scale AMS circuits. However, the absence of high-quality datasets has led to issues such as model hallucination, which undermines the robustness of automatically generated circuit designs. To address this issue, this paper introduces AMSnet-KG, a dataset encompassing various AMS circuit schematics and netlists. We construct a knowledge graph with annotations on detailed functional and performance characteristics. Facilitated by AMSnet-KG, we propose an automated AMS circuit generation framework that utilizes the comprehensive knowledge embedded in LLMs. We first formulate a design strategy (e.g., circuit architecture using a number of circuit components) based on required specifications. Next, matched circuit components are retrieved and assembled into a complete topology, and transistor sizing is obtained through Bayesian optimization. Simulation results of the netlist are fed back to the LLM for further topology refinement, ensuring the circuit design specifications are met. We perform case studies of operational amplifier and comparator design to verify the automatic design flow from specifications to netlists with minimal human effort. The dataset used in this paper will be open-sourced upon publishing of this paper. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13560v1-abstract-full').style.display = 'none'; document.getElementById('2411.13560v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12783">arXiv:2411.12783</a> <span> [<a href="https://arxiv.org/pdf/2411.12783">pdf</a>, <a href="https://arxiv.org/format/2411.12783">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Med-2E3: A 2D-Enhanced 3D Medical Multimodal Large Language Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yiming Shi</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+X">Xun Zhu</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Y">Ying Hu</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+C">Chenyi Guo</a>, <a href="/search/cs?searchtype=author&query=Li%2C+M">Miao Li</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+J">Ji Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12783v1-abstract-short" style="display: inline;"> The analysis of 3D medical images is crucial for modern healthcare, yet traditional task-specific models are becoming increasingly inadequate due to limited generalizability across diverse clinical scenarios. Multimodal large language models (MLLMs) offer a promising solution to these challenges. However, existing MLLMs have limitations in fully leveraging the rich, hierarchical information embedd… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12783v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12783v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12783v1-abstract-full" style="display: none;"> The analysis of 3D medical images is crucial for modern healthcare, yet traditional task-specific models are becoming increasingly inadequate due to limited generalizability across diverse clinical scenarios. Multimodal large language models (MLLMs) offer a promising solution to these challenges. However, existing MLLMs have limitations in fully leveraging the rich, hierarchical information embedded in 3D medical images. Inspired by clinical practice, where radiologists focus on both 3D spatial structure and 2D planar content, we propose Med-2E3, a novel MLLM for 3D medical image analysis that integrates 3D and 2D encoders. To aggregate 2D features more effectively, we design a Text-Guided Inter-Slice (TG-IS) scoring module, which scores the attention of each 2D slice based on slice contents and task instructions. To the best of our knowledge, Med-2E3 is the first MLLM to integrate both 3D and 2D features for 3D medical image analysis. Experiments on a large-scale, open-source 3D medical multimodal benchmark demonstrate that Med-2E3 exhibits task-specific attention distribution and significantly outperforms current state-of-the-art models, with a 14% improvement in report generation and a 5% gain in medical visual question answering (VQA), highlighting the model's potential in addressing complex multimodal clinical tasks. The code will be released upon acceptance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12783v1-abstract-full').style.display = 'none'; document.getElementById('2411.12783v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12547">arXiv:2411.12547</a> <span> [<a href="https://arxiv.org/pdf/2411.12547">pdf</a>, <a href="https://arxiv.org/format/2411.12547">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> S3TU-Net: Structured Convolution and Superpixel Transformer for Lung Nodule Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yuke Wu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xiang Liu</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yunyu Shi</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xinyi Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhenglei Wang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Y">YuQing Xu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S+H">Shuo Hong Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12547v1-abstract-short" style="display: inline;"> The irregular and challenging characteristics of lung adenocarcinoma nodules in computed tomography (CT) images complicate staging diagnosis, making accurate segmentation critical for clinicians to extract detailed lesion information. In this study, we propose a segmentation model, S3TU-Net, which integrates multi-dimensional spatial connectors and a superpixel-based visual transformer. S3TU-Net i… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12547v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12547v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12547v1-abstract-full" style="display: none;"> The irregular and challenging characteristics of lung adenocarcinoma nodules in computed tomography (CT) images complicate staging diagnosis, making accurate segmentation critical for clinicians to extract detailed lesion information. In this study, we propose a segmentation model, S3TU-Net, which integrates multi-dimensional spatial connectors and a superpixel-based visual transformer. S3TU-Net is built on a multi-view CNN-Transformer hybrid architecture, incorporating superpixel algorithms, structured weighting, and spatial shifting techniques to achieve superior segmentation performance. The model leverages structured convolution blocks (DWF-Conv/D2BR-Conv) to extract multi-scale local features while mitigating overfitting. To enhance multi-scale feature fusion, we introduce the S2-MLP Link, integrating spatial shifting and attention mechanisms at the skip connections. Additionally, the residual-based superpixel visual transformer (RM-SViT) effectively merges global and local features by employing sparse correlation learning and multi-branch attention to capture long-range dependencies, with residual connections enhancing stability and computational efficiency. Experimental results on the LIDC-IDRI dataset demonstrate that S3TU-Net achieves a DSC, precision, and IoU of 89.04%, 90.73%, and 90.70%, respectively. Compared to recent methods, S3TU-Net improves DSC by 4.52% and sensitivity by 3.16%, with other metrics showing an approximate 2% increase. In addition to comparison and ablation studies, we validated the generalization ability of our model on the EPDB private dataset, achieving a DSC of 86.40%. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12547v1-abstract-full').style.display = 'none'; document.getElementById('2411.12547v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11475">arXiv:2411.11475</a> <span> [<a href="https://arxiv.org/pdf/2411.11475">pdf</a>, <a href="https://arxiv.org/format/2411.11475">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MVLight: Relightable Text-to-3D Generation via Light-conditioned Multi-View Diffusion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shim%2C+D">Dongseok Shim</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yichun Shi</a>, <a href="/search/cs?searchtype=author&query=Li%2C+K">Kejie Li</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+H+J">H. Jin Kim</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+P">Peng Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11475v1-abstract-short" style="display: inline;"> Recent advancements in text-to-3D generation, building on the success of high-performance text-to-image generative models, have made it possible to create imaginative and richly textured 3D objects from textual descriptions. However, a key challenge remains in effectively decoupling light-independent and lighting-dependent components to enhance the quality of generated 3D models and their relighti… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11475v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11475v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11475v1-abstract-full" style="display: none;"> Recent advancements in text-to-3D generation, building on the success of high-performance text-to-image generative models, have made it possible to create imaginative and richly textured 3D objects from textual descriptions. However, a key challenge remains in effectively decoupling light-independent and lighting-dependent components to enhance the quality of generated 3D models and their relighting performance. In this paper, we present MVLight, a novel light-conditioned multi-view diffusion model that explicitly integrates lighting conditions directly into the generation process. This enables the model to synthesize high-quality images that faithfully reflect the specified lighting environment across multiple camera views. By leveraging this capability to Score Distillation Sampling (SDS), we can effectively synthesize 3D models with improved geometric precision and relighting capabilities. We validate the effectiveness of MVLight through extensive experiments and a user study. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11475v1-abstract-full').style.display = 'none'; document.getElementById('2411.11475v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11348">arXiv:2411.11348</a> <span> [<a href="https://arxiv.org/pdf/2411.11348">pdf</a>, <a href="https://arxiv.org/format/2411.11348">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Fluid Dynamics">physics.flu-dyn</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Modeling Multivariable High-resolution 3D Urban Microclimate Using Localized Fourier Neural Operator </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Qin%2C+S">Shaoxiang Qin</a>, <a href="/search/cs?searchtype=author&query=Zhan%2C+D">Dongxue Zhan</a>, <a href="/search/cs?searchtype=author&query=Geng%2C+D">Dingyang Geng</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+W">Wenhui Peng</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+G">Geng Tian</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yurong Shi</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+N">Naiping Gao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xue Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L+L">Liangzhu Leon Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11348v1-abstract-short" style="display: inline;"> Accurate urban microclimate analysis with wind velocity and temperature is vital for energy-efficient urban planning, supporting carbon reduction, enhancing public health and comfort, and advancing the low-altitude economy. However, traditional computational fluid dynamics (CFD) simulations that couple velocity and temperature are computationally expensive. Recent machine learning advancements off… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11348v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11348v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11348v1-abstract-full" style="display: none;"> Accurate urban microclimate analysis with wind velocity and temperature is vital for energy-efficient urban planning, supporting carbon reduction, enhancing public health and comfort, and advancing the low-altitude economy. However, traditional computational fluid dynamics (CFD) simulations that couple velocity and temperature are computationally expensive. Recent machine learning advancements offer promising alternatives for accelerating urban microclimate simulations. The Fourier neural operator (FNO) has shown efficiency and accuracy in predicting single-variable velocity magnitudes in urban wind fields. Yet, for multivariable high-resolution 3D urban microclimate prediction, FNO faces three key limitations: blurry output quality, high GPU memory demand, and substantial data requirements. To address these issues, we propose a novel localized Fourier neural operator (Local-FNO) model that employs local training, geometry encoding, and patch overlapping. Local-FNO provides accurate predictions for rapidly changing turbulence in urban microclimate over 60 seconds, four times the average turbulence integral time scale, with an average error of 0.35 m/s in velocity and 0.30 掳C in temperature. It also accurately captures turbulent heat flux represented by the velocity-temperature correlation. In a 2 km by 2 km domain, Local-FNO resolves turbulence patterns down to a 10 m resolution. It provides high-resolution predictions with 150 million feature dimensions on a single 32 GB GPU at nearly 50 times the speed of a CFD solver. Compared to FNO, Local-FNO achieves a 23.9% reduction in prediction error and a 47.3% improvement in turbulent fluctuation correlation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11348v1-abstract-full').style.display = 'none'; document.getElementById('2411.11348v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11282">arXiv:2411.11282</a> <span> [<a href="https://arxiv.org/pdf/2411.11282">pdf</a>, <a href="https://arxiv.org/format/2411.11282">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Continuous K-space Recovery Network with Image Guidance for Fast MRI Reconstruction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Meng%2C+Y">Yucong Meng</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zhiwei Yang</a>, <a href="/search/cs?searchtype=author&query=Duan%2C+M">Minghong Duan</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yonghong Shi</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Z">Zhijian Song</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11282v1-abstract-short" style="display: inline;"> Magnetic resonance imaging (MRI) is a crucial tool for clinical diagnosis while facing the challenge of long scanning time. To reduce the acquisition time, fast MRI reconstruction aims to restore high-quality images from the undersampled k-space. Existing methods typically train deep learning models to map the undersampled data to artifact-free MRI images. However, these studies often overlook the… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11282v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11282v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11282v1-abstract-full" style="display: none;"> Magnetic resonance imaging (MRI) is a crucial tool for clinical diagnosis while facing the challenge of long scanning time. To reduce the acquisition time, fast MRI reconstruction aims to restore high-quality images from the undersampled k-space. Existing methods typically train deep learning models to map the undersampled data to artifact-free MRI images. However, these studies often overlook the unique properties of k-space and directly apply general networks designed for image processing to k-space recovery, leaving the precise learning of k-space largely underexplored. In this work, we propose a continuous k-space recovery network from a new perspective of implicit neural representation with image domain guidance, which boosts the performance of MRI reconstruction. Specifically, (1) an implicit neural representation based encoder-decoder structure is customized to continuously query unsampled k-values. (2) an image guidance module is designed to mine the semantic information from the low-quality MRI images to further guide the k-space recovery. (3) a multi-stage training strategy is proposed to recover dense k-space progressively. Extensive experiments conducted on CC359, fastMRI, and IXI datasets demonstrate the effectiveness of our method and its superiority over other competitors. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11282v1-abstract-full').style.display = 'none'; document.getElementById('2411.11282v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11232">arXiv:2411.11232</a> <span> [<a href="https://arxiv.org/pdf/2411.11232">pdf</a>, <a href="https://arxiv.org/format/2411.11232">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> SAMOS: A Neural MOS Prediction Model Leveraging Semantic Representations and Acoustic Features </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yu-Fei Shi</a>, <a href="/search/cs?searchtype=author&query=Ai%2C+Y">Yang Ai</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+Y">Ye-Xin Lu</a>, <a href="/search/cs?searchtype=author&query=Du%2C+H">Hui-Peng Du</a>, <a href="/search/cs?searchtype=author&query=Ling%2C+Z">Zhen-Hua Ling</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11232v1-abstract-short" style="display: inline;"> Assessing the naturalness of speech using mean opinion score (MOS) prediction models has positive implications for the automatic evaluation of speech synthesis systems. Early MOS prediction models took the raw waveform or amplitude spectrum of speech as input, whereas more advanced methods employed self-supervised-learning (SSL) based models to extract semantic representations from speech for MOS… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11232v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11232v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11232v1-abstract-full" style="display: none;"> Assessing the naturalness of speech using mean opinion score (MOS) prediction models has positive implications for the automatic evaluation of speech synthesis systems. Early MOS prediction models took the raw waveform or amplitude spectrum of speech as input, whereas more advanced methods employed self-supervised-learning (SSL) based models to extract semantic representations from speech for MOS prediction. These methods utilized limited aspects of speech information for MOS prediction, resulting in restricted prediction accuracy. Therefore, in this paper, we propose SAMOS, a MOS prediction model that leverages both Semantic and Acoustic information of speech to be assessed. Specifically, the proposed SAMOS leverages a pretrained wav2vec2 to extract semantic representations and uses the feature extractor of a pretrained BiVocoder to extract acoustic features. These two types of features are then fed into the prediction network, which includes multi-task heads and an aggregation layer, to obtain the final MOS score. Experimental results demonstrate that the proposed SAMOS outperforms current state-of-the-art MOS prediction models on the BVCC dataset and performs comparable performance on the BC2019 dataset, according to the results of system-level evaluation metrics. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11232v1-abstract-full').style.display = 'none'; document.getElementById('2411.11232v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11123">arXiv:2411.11123</a> <span> [<a href="https://arxiv.org/pdf/2411.11123">pdf</a>, <a href="https://arxiv.org/format/2411.11123">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Pitch-and-Spectrum-Aware Singing Quality Assessment with Bias Correction and Model Fusion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yu-Fei Shi</a>, <a href="/search/cs?searchtype=author&query=Ai%2C+Y">Yang Ai</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+Y">Ye-Xin Lu</a>, <a href="/search/cs?searchtype=author&query=Du%2C+H">Hui-Peng Du</a>, <a href="/search/cs?searchtype=author&query=Ling%2C+Z">Zhen-Hua Ling</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11123v1-abstract-short" style="display: inline;"> We participated in track 2 of the VoiceMOS Challenge 2024, which aimed to predict the mean opinion score (MOS) of singing samples. Our submission secured the first place among all participating teams, excluding the official baseline. In this paper, we further improve our submission and propose a novel Pitch-and-Spectrum-aware Singing Quality Assessment (PS-SQA) method. The PS-SQA is designed based… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11123v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11123v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11123v1-abstract-full" style="display: none;"> We participated in track 2 of the VoiceMOS Challenge 2024, which aimed to predict the mean opinion score (MOS) of singing samples. Our submission secured the first place among all participating teams, excluding the official baseline. In this paper, we further improve our submission and propose a novel Pitch-and-Spectrum-aware Singing Quality Assessment (PS-SQA) method. The PS-SQA is designed based on the self-supervised-learning (SSL) MOS predictor, incorporating singing pitch and spectral information, which are extracted using pitch histogram and non-quantized neural codec, respectively. Additionally, the PS-SQA introduces a bias correction strategy to address prediction biases caused by low-resource training samples, and employs model fusion technology to further enhance prediction accuracy. Experimental results confirm that our proposed PS-SQA significantly outperforms all competing systems across all system-level metrics, confirming its strong sing quality assessment capabilities. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11123v1-abstract-full').style.display = 'none'; document.getElementById('2411.11123v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09891">arXiv:2411.09891</a> <span> [<a href="https://arxiv.org/pdf/2411.09891">pdf</a>, <a href="https://arxiv.org/format/2411.09891">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Off-Dynamics Reinforcement Learning via Domain Adaptation and Reward Augmented Imitation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guo%2C+Y">Yihong Guo</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yixuan Wang</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yuanyuan Shi</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+P">Pan Xu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+A">Anqi Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09891v1-abstract-short" style="display: inline;"> Training a policy in a source domain for deployment in the target domain under a dynamics shift can be challenging, often resulting in performance degradation. Previous work tackles this challenge by training on the source domain with modified rewards derived by matching distributions between the source and the target optimal trajectories. However, pure modified rewards only ensure the behavior of… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09891v1-abstract-full').style.display = 'inline'; document.getElementById('2411.09891v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09891v1-abstract-full" style="display: none;"> Training a policy in a source domain for deployment in the target domain under a dynamics shift can be challenging, often resulting in performance degradation. Previous work tackles this challenge by training on the source domain with modified rewards derived by matching distributions between the source and the target optimal trajectories. However, pure modified rewards only ensure the behavior of the learned policy in the source domain resembles trajectories produced by the target optimal policies, which does not guarantee optimal performance when the learned policy is actually deployed to the target domain. In this work, we propose to utilize imitation learning to transfer the policy learned from the reward modification to the target domain so that the new policy can generate the same trajectories in the target domain. Our approach, Domain Adaptation and Reward Augmented Imitation Learning (DARAIL), utilizes the reward modification for domain adaptation and follows the general framework of generative adversarial imitation learning from observation (GAIfO) by applying a reward augmented estimator for the policy optimization step. Theoretically, we present an error bound for our method under a mild assumption regarding the dynamics shift to justify the motivation of our method. Empirically, our method outperforms the pure modified reward method without imitation learning and also outperforms other baselines in benchmark off-dynamics environments. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09891v1-abstract-full').style.display = 'none'; document.getElementById('2411.09891v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published at Neurips 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09219">arXiv:2411.09219</a> <span> [<a href="https://arxiv.org/pdf/2411.09219">pdf</a>, <a href="https://arxiv.org/format/2411.09219">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Harnessing Vision Foundation Models for High-Performance, Training-Free Open Vocabulary Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yuheng Shi</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+M">Minjing Dong</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+C">Chang Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09219v1-abstract-short" style="display: inline;"> While Contrastive Language-Image Pre-training (CLIP) has advanced open-vocabulary predictions, its performance on semantic segmentation remains suboptimal. This shortfall primarily stems from its spatial-invariant semantic features and constrained resolution. While previous adaptations addressed spatial invariance semantic by modifying the self-attention in CLIP's image encoder, the issue of limit… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09219v1-abstract-full').style.display = 'inline'; document.getElementById('2411.09219v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09219v1-abstract-full" style="display: none;"> While Contrastive Language-Image Pre-training (CLIP) has advanced open-vocabulary predictions, its performance on semantic segmentation remains suboptimal. This shortfall primarily stems from its spatial-invariant semantic features and constrained resolution. While previous adaptations addressed spatial invariance semantic by modifying the self-attention in CLIP's image encoder, the issue of limited resolution remains unexplored. Different from previous segment-then-splice methods that segment sub-images via a sliding window and splice the results, we introduce a splice-then-segment paradigm that incorporates Segment-Anything Model (SAM) to tackle the resolution issue since SAM excels at extracting fine-grained semantic correlations from high-resolution images. Specifically, we introduce Trident, a training-free framework that first splices features extracted by CLIP and DINO from sub-images, then leverages SAM's encoder to create a correlation matrix for global aggregation, enabling a broadened receptive field for effective segmentation. Besides, we propose a refinement strategy for CLIP's coarse segmentation outputs by transforming them into prompts for SAM, further enhancing the segmentation performance. Trident achieves a significant improvement in the mIoU across eight benchmarks compared with the current SOTA, increasing from 44.4 to 48.6.Code is available at https://github.com/YuHengsss/Trident. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09219v1-abstract-full').style.display = 'none'; document.getElementById('2411.09219v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.08244">arXiv:2411.08244</a> <span> [<a href="https://arxiv.org/pdf/2411.08244">pdf</a>, <a href="https://arxiv.org/format/2411.08244">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Emerging Technologies">cs.ET</span> </div> </div> <p class="title is-5 mathjax"> NVCiM-PT: An NVCiM-assisted Prompt Tuning Framework for Edge LLMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Qin%2C+R">Ruiyang Qin</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+P">Pengyu Ren</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+Z">Zheyu Yan</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Liu Liu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+D">Dancheng Liu</a>, <a href="/search/cs?searchtype=author&query=Nassereldine%2C+A">Amir Nassereldine</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+J">Jinjun Xiong</a>, <a href="/search/cs?searchtype=author&query=Ni%2C+K">Kai Ni</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+S">Sharon Hu</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yiyu Shi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.08244v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) deployed on edge devices, known as edge LLMs, need to continuously fine-tune their model parameters from user-generated data under limited resource constraints. However, most existing learning methods are not applicable for edge LLMs because of their reliance on high resources and low learning capacity. Prompt tuning (PT) has recently emerged as an effective fine-tunin… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08244v1-abstract-full').style.display = 'inline'; document.getElementById('2411.08244v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.08244v1-abstract-full" style="display: none;"> Large Language Models (LLMs) deployed on edge devices, known as edge LLMs, need to continuously fine-tune their model parameters from user-generated data under limited resource constraints. However, most existing learning methods are not applicable for edge LLMs because of their reliance on high resources and low learning capacity. Prompt tuning (PT) has recently emerged as an effective fine-tuning method for edge LLMs by only modifying a small portion of LLM parameters, but it suffers from user domain shifts, resulting in repetitive training and losing resource efficiency. Conventional techniques to address domain shift issues often involve complex neural networks and sophisticated training, which are incompatible for PT for edge LLMs. Therefore, an open research question is how to address domain shift issues for edge LLMs with limited resources. In this paper, we propose a prompt tuning framework for edge LLMs, exploiting the benefits offered by non-volatile computing-in-memory (NVCiM) architectures. We introduce a novel NVCiM-assisted PT framework, where we narrow down the core operations to matrix-matrix multiplication, which can then be accelerated by performing in-situ computation on NVCiM. To the best of our knowledge, this is the first work employing NVCiM to improve the edge LLM PT performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08244v1-abstract-full').style.display = 'none'; document.getElementById('2411.08244v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by DATE 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07762">arXiv:2411.07762</a> <span> [<a href="https://arxiv.org/pdf/2411.07762">pdf</a>, <a href="https://arxiv.org/format/2411.07762">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> ASER: Activation Smoothing and Error Reconstruction for Large Language Model Quantization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhao%2C+W">Weibo Zhao</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yubin Shi</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+X">Xinyu Lyu</a>, <a href="/search/cs?searchtype=author&query=Sui%2C+W">Wanchen Sui</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Shen Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yong Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.07762v1-abstract-short" style="display: inline;"> Quantization stands as a pivotal technique for large language model (LLM) serving, yet it poses significant challenges particularly in achieving effective low-bit quantization. The limited numerical mapping makes the quantized model produce a non-trivial error, bringing out intolerable performance degration. This paper is anchored in the basic idea of model compression objectives, and delves into… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07762v1-abstract-full').style.display = 'inline'; document.getElementById('2411.07762v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.07762v1-abstract-full" style="display: none;"> Quantization stands as a pivotal technique for large language model (LLM) serving, yet it poses significant challenges particularly in achieving effective low-bit quantization. The limited numerical mapping makes the quantized model produce a non-trivial error, bringing out intolerable performance degration. This paper is anchored in the basic idea of model compression objectives, and delves into the layer-wise error distribution of LLMs during post-training quantization. Subsequently, we introduce ASER, an algorithm consisting of (1) Error Reconstruction: low-rank compensation for quantization error with LoRA-style matrices constructed by whitening SVD; (2) Activation Smoothing: outlier extraction to gain smooth activation and better error compensation. ASER is capable of quantizing typical LLMs to low-bit ones, particularly preserving accuracy even in W4A8 per-channel setup. Experimental results show that ASER is competitive among the state-of-the-art quantization algorithms, showing potential to activation quantization, with minor overhead. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07762v1-abstract-full').style.display = 'none'; document.getElementById('2411.07762v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06686">arXiv:2411.06686</a> <span> [<a href="https://arxiv.org/pdf/2411.06686">pdf</a>, <a href="https://arxiv.org/format/2411.06686">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> SeedEdit: Align Image Re-Generation to Image Editing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yichun Shi</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+P">Peng Wang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+W">Weilin Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06686v1-abstract-short" style="display: inline;"> We introduce SeedEdit, a diffusion model that is able to revise a given image with any text prompt. In our perspective, the key to such a task is to obtain an optimal balance between maintaining the original image, i.e. image reconstruction, and generating a new image, i.e. image re-generation. To this end, we start from a weak generator (text-to-image model) that creates diverse pairs between suc… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06686v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06686v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06686v1-abstract-full" style="display: none;"> We introduce SeedEdit, a diffusion model that is able to revise a given image with any text prompt. In our perspective, the key to such a task is to obtain an optimal balance between maintaining the original image, i.e. image reconstruction, and generating a new image, i.e. image re-generation. To this end, we start from a weak generator (text-to-image model) that creates diverse pairs between such two directions and gradually align it into a strong image editor that well balances between the two tasks. SeedEdit can achieve more diverse and stable editing capability over prior image editing methods, enabling sequential revision over images generated by diffusion models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06686v1-abstract-full').style.display = 'none'; document.getElementById('2411.06686v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Our website: https://team.doubao.com/seededit</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.04772">arXiv:2411.04772</a> <span> [<a href="https://arxiv.org/pdf/2411.04772">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Attention Masks Help Adversarial Attacks to Bypass Safety Detectors </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yunfan Shi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.04772v1-abstract-short" style="display: inline;"> Despite recent research advancements in adversarial attack methods, current approaches against XAI monitors are still discoverable and slower. In this paper, we present an adaptive framework for attention mask generation to enable stealthy, explainable and efficient PGD image classification adversarial attack under XAI monitors. Specifically, we utilize mutation XAI mixture and multitask self-supe… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04772v1-abstract-full').style.display = 'inline'; document.getElementById('2411.04772v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.04772v1-abstract-full" style="display: none;"> Despite recent research advancements in adversarial attack methods, current approaches against XAI monitors are still discoverable and slower. In this paper, we present an adaptive framework for attention mask generation to enable stealthy, explainable and efficient PGD image classification adversarial attack under XAI monitors. Specifically, we utilize mutation XAI mixture and multitask self-supervised X-UNet for attention mask generation to guide PGD attack. Experiments on MNIST (MLP), CIFAR-10 (AlexNet) have shown that our system can outperform benchmark PGD, Sparsefool and SOTA SINIFGSM in balancing among stealth, efficiency and explainability which is crucial for effectively fooling SOTA defense protected classifiers. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04772v1-abstract-full').style.display = 'none'; document.getElementById('2411.04772v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.03591">arXiv:2411.03591</a> <span> [<a href="https://arxiv.org/pdf/2411.03591">pdf</a>, <a href="https://arxiv.org/format/2411.03591">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> vMF-Contact: Uncertainty-aware Evidential Learning for Probabilistic Contact-grasp in Noisy Clutter </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yitian Shi</a>, <a href="/search/cs?searchtype=author&query=Welte%2C+E">Edgar Welte</a>, <a href="/search/cs?searchtype=author&query=Gilles%2C+M">Maximilian Gilles</a>, <a href="/search/cs?searchtype=author&query=Rayyes%2C+R">Rania Rayyes</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.03591v2-abstract-short" style="display: inline;"> Grasp learning in noisy environments, such as occlusions, sensor noise, and out-of-distribution (OOD) objects, poses significant challenges. Recent learning-based approaches focus primarily on capturing aleatoric uncertainty from inherent data noise. The epistemic uncertainty, which represents the OOD recognition, is often addressed by ensembles with multiple forward paths, limiting real-time appl… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03591v2-abstract-full').style.display = 'inline'; document.getElementById('2411.03591v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.03591v2-abstract-full" style="display: none;"> Grasp learning in noisy environments, such as occlusions, sensor noise, and out-of-distribution (OOD) objects, poses significant challenges. Recent learning-based approaches focus primarily on capturing aleatoric uncertainty from inherent data noise. The epistemic uncertainty, which represents the OOD recognition, is often addressed by ensembles with multiple forward paths, limiting real-time application. In this paper, we propose an uncertainty-aware approach for 6-DoF grasp detection using evidential learning to comprehensively capture both uncertainties in real-world robotic grasping. As a key contribution, we introduce vMF-Contact, a novel architecture for learning hierarchical contact grasp representations with probabilistic modeling of directional uncertainty as von Mises-Fisher (vMF) distribution. To achieve this, we derive and analyze the theoretical formulation of the second-order objective on the posterior parametrization, providing formal guarantees for the model's ability to quantify uncertainty and improve grasp prediction performance. Moreover, we enhance feature expressiveness by applying partial point reconstructions as an auxiliary task, improving the comprehension of uncertainty quantification as well as the generalization to unseen objects. In the real-world experiments, our method demonstrates a significant improvement by 39% in the overall clearance rate compared to the baselines. Video is under https://www.youtube.com/watch?v=4aQsrDgdV8Y&t=12s <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03591v2-abstract-full').style.display = 'none'; document.getElementById('2411.03591v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 5 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.03223">arXiv:2411.03223</a> <span> [<a href="https://arxiv.org/pdf/2411.03223">pdf</a>, <a href="https://arxiv.org/format/2411.03223">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Beyond Grid Data: Exploring Graph Neural Networks for Earth Observation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhao%2C+S">Shan Zhao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhaiyu Chen</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+Z">Zhitong Xiong</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yilei Shi</a>, <a href="/search/cs?searchtype=author&query=Saha%2C+S">Sudipan Saha</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+X+X">Xiao Xiang Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.03223v2-abstract-short" style="display: inline;"> Earth Observation (EO) data analysis has been significantly revolutionized by deep learning (DL), with applications typically limited to grid-like data structures. Graph Neural Networks (GNNs) emerge as an important innovation, propelling DL into the non-Euclidean domain. Naturally, GNNs can effectively tackle the challenges posed by diverse modalities, multiple sensors, and the heterogeneous natu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03223v2-abstract-full').style.display = 'inline'; document.getElementById('2411.03223v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.03223v2-abstract-full" style="display: none;"> Earth Observation (EO) data analysis has been significantly revolutionized by deep learning (DL), with applications typically limited to grid-like data structures. Graph Neural Networks (GNNs) emerge as an important innovation, propelling DL into the non-Euclidean domain. Naturally, GNNs can effectively tackle the challenges posed by diverse modalities, multiple sensors, and the heterogeneous nature of EO data. To introduce GNNs in the related domains, our review begins by offering fundamental knowledge on GNNs. Then, we summarize the generic problems in EO, to which GNNs can offer potential solutions. Following this, we explore a broad spectrum of GNNs' applications to scientific problems in Earth systems, covering areas such as weather and climate analysis, disaster management, air quality monitoring, agriculture, land cover classification, hydrological process modeling, and urban modeling. The rationale behind adopting GNNs in these fields is explained, alongside methodologies for organizing graphs and designing favorable architectures for various tasks. Furthermore, we highlight methodological challenges of implementing GNNs in these domains and possible solutions that could guide future research. While acknowledging that GNNs are not a universal solution, we conclude the paper by comparing them with other popular architectures like transformers and analyzing their potential synergies. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03223v2-abstract-full').style.display = 'none'; document.getElementById('2411.03223v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 5 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted for publication in Geoscience and Remote Sensing Magazine (GRSM)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00646">arXiv:2411.00646</a> <span> [<a href="https://arxiv.org/pdf/2411.00646">pdf</a>, <a href="https://arxiv.org/format/2411.00646">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Phase Diagram of Vision Large Language Models Inference: A Perspective from Interaction across Image and Instruction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wei%2C+H">Houjing Wei</a>, <a href="/search/cs?searchtype=author&query=Cho%2C+H">Hakaze Cho</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yuting Shi</a>, <a href="/search/cs?searchtype=author&query=Inoue%2C+N">Naoya Inoue</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00646v1-abstract-short" style="display: inline;"> Vision Large Language Models (VLLMs) usually take input as a concatenation of image token embeddings and text token embeddings and conduct causal modeling. However, their internal behaviors remain underexplored, raising the question of interaction among two types of tokens. To investigate such multimodal interaction during model inference, in this paper, we measure the contextualization among the… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00646v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00646v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00646v1-abstract-full" style="display: none;"> Vision Large Language Models (VLLMs) usually take input as a concatenation of image token embeddings and text token embeddings and conduct causal modeling. However, their internal behaviors remain underexplored, raising the question of interaction among two types of tokens. To investigate such multimodal interaction during model inference, in this paper, we measure the contextualization among the hidden state vectors of tokens from different modalities. Our experiments uncover a four-phase inference dynamics of VLLMs against the depth of Transformer-based LMs, including (I) Alignment: In very early layers, contextualization emerges between modalities, suggesting a feature space alignment. (II) Intra-modal Encoding: In early layers, intra-modal contextualization is enhanced while inter-modal interaction is suppressed, suggesting a local encoding within modalities. (III) Inter-modal Encoding: In later layers, contextualization across modalities is enhanced, suggesting a deeper fusion across modalities. (IV) Output Preparation: In very late layers, contextualization is reduced globally, and hidden states are aligned towards the unembedding space. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00646v1-abstract-full').style.display = 'none'; document.getElementById('2411.00646v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">6 pages, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00473">arXiv:2411.00473</a> <span> [<a href="https://arxiv.org/pdf/2411.00473">pdf</a>, <a href="https://arxiv.org/format/2411.00473">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Optics">physics.optics</span> </div> </div> <p class="title is-5 mathjax"> Synergistic Interplay of Large Language Model and Digital Twin for Autonomous Optical Networks: Field Demonstrations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yuchen Song</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yao Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+A">Anni Zhou</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yan Shi</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+S">Shikui Shen</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+X">Xiongyan Tang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jin Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Min Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+D">Danshi Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00473v1-abstract-short" style="display: inline;"> The development of large language models (LLM) has revolutionized various fields and is anticipated to drive the advancement of autonomous systems. In the context of autonomous optical networks, creating a high-level cognitive agent in the control layer remains a challenge. However, LLM is primarily developed for natural language processing tasks, rendering them less effective in predicting the ph… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00473v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00473v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00473v1-abstract-full" style="display: none;"> The development of large language models (LLM) has revolutionized various fields and is anticipated to drive the advancement of autonomous systems. In the context of autonomous optical networks, creating a high-level cognitive agent in the control layer remains a challenge. However, LLM is primarily developed for natural language processing tasks, rendering them less effective in predicting the physical dynamics of optical communications. Moreover, optical networks demand rigorous stability, where direct deployment of strategies generated from LLM poses safety concerns. In this paper, a digital twin (DT)-enhanced LLM scheme is proposed to facilitate autonomous optical networks. By leveraging monitoring data and advanced models, the DT of optical networks can accurately characterize their physical dynamics, furnishing LLMs with dynamic-updated information for reliable decision-making. Prior to deployment, the generated strategies from LLM can be pre-verified in the DT platform, which also provides feedback to the LLM for further refinement of strategies. The synergistic interplay between DT and LLM for autonomous optical networks is demonstrated through three scenarios: performance optimization under dynamic loadings in an experimental C+L-band long-haul transmission link, protection switching for device upgrading in a field-deployed six-node mesh network, and performance recovery after fiber cuts in a field-deployed C+L-band transmission link. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00473v1-abstract-full').style.display = 'none'; document.getElementById('2411.00473v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7 pages,6 figures; Accepted by IEEE Communications Magazine, Open call</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00444">arXiv:2411.00444</a> <span> [<a href="https://arxiv.org/pdf/2411.00444">pdf</a>, <a href="https://arxiv.org/format/2411.00444">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Expert-level protocol translation for self-driving labs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yu-Zhe Shi</a>, <a href="/search/cs?searchtype=author&query=Meng%2C+F">Fanxu Meng</a>, <a href="/search/cs?searchtype=author&query=Hou%2C+H">Haofei Hou</a>, <a href="/search/cs?searchtype=author&query=Bi%2C+Z">Zhangqian Bi</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Q">Qiao Xu</a>, <a href="/search/cs?searchtype=author&query=Ruan%2C+L">Lecheng Ruan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qining Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00444v1-abstract-short" style="display: inline;"> Recent development in Artificial Intelligence (AI) models has propelled their application in scientific discovery, but the validation and exploration of these discoveries require subsequent empirical experimentation. The concept of self-driving laboratories promises to automate and thus boost the experimental process following AI-driven discoveries. However, the transition of experimental protocol… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00444v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00444v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00444v1-abstract-full" style="display: none;"> Recent development in Artificial Intelligence (AI) models has propelled their application in scientific discovery, but the validation and exploration of these discoveries require subsequent empirical experimentation. The concept of self-driving laboratories promises to automate and thus boost the experimental process following AI-driven discoveries. However, the transition of experimental protocols, originally crafted for human comprehension, into formats interpretable by machines presents significant challenges, which, within the context of specific expert domain, encompass the necessity for structured as opposed to natural language, the imperative for explicit rather than tacit knowledge, and the preservation of causality and consistency throughout protocol steps. Presently, the task of protocol translation predominantly requires the manual and labor-intensive involvement of domain experts and information technology specialists, rendering the process time-intensive. To address these issues, we propose a framework that automates the protocol translation process through a three-stage workflow, which incrementally constructs Protocol Dependence Graphs (PDGs) that approach structured on the syntax level, completed on the semantics level, and linked on the execution level. Quantitative and qualitative evaluations have demonstrated its performance at par with that of human experts, underscoring its potential to significantly expedite and democratize the process of scientific discovery by elevating the automation capabilities within self-driving laboratories. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00444v1-abstract-full').style.display = 'none'; document.getElementById('2411.00444v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">In Advances in Neural Information Processing Systems (NeurIPS'24)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.24219">arXiv:2410.24219</a> <span> [<a href="https://arxiv.org/pdf/2410.24219">pdf</a>, <a href="https://arxiv.org/format/2410.24219">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Enhancing Motion in Text-to-Video Generation with Decomposed Encoding and Conditioning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ruan%2C+P">Penghui Ruan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+P">Pichao Wang</a>, <a href="/search/cs?searchtype=author&query=Saxena%2C+D">Divya Saxena</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+J">Jiannong Cao</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yuhui Shi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.24219v1-abstract-short" style="display: inline;"> Despite advancements in Text-to-Video (T2V) generation, producing videos with realistic motion remains challenging. Current models often yield static or minimally dynamic outputs, failing to capture complex motions described by text. This issue stems from the internal biases in text encoding, which overlooks motions, and inadequate conditioning mechanisms in T2V generation models. To address this,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.24219v1-abstract-full').style.display = 'inline'; document.getElementById('2410.24219v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.24219v1-abstract-full" style="display: none;"> Despite advancements in Text-to-Video (T2V) generation, producing videos with realistic motion remains challenging. Current models often yield static or minimally dynamic outputs, failing to capture complex motions described by text. This issue stems from the internal biases in text encoding, which overlooks motions, and inadequate conditioning mechanisms in T2V generation models. To address this, we propose a novel framework called DEcomposed MOtion (DEMO), which enhances motion synthesis in T2V generation by decomposing both text encoding and conditioning into content and motion components. Our method includes a content encoder for static elements and a motion encoder for temporal dynamics, alongside separate content and motion conditioning mechanisms. Crucially, we introduce text-motion and video-motion supervision to improve the model's understanding and generation of motion. Evaluations on benchmarks such as MSR-VTT, UCF-101, WebVid-10M, EvalCrafter, and VBench demonstrate DEMO's superior ability to produce videos with enhanced motion dynamics while maintaining high visual quality. Our approach significantly advances T2V generation by integrating comprehensive motion understanding directly from textual descriptions. Project page: https://PR-Ryan.github.io/DEMO-project/ <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.24219v1-abstract-full').style.display = 'none'; document.getElementById('2410.24219v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at NeurIPS 2024, code available at https://github.com/PR-Ryan/DEMO</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.23805">arXiv:2410.23805</a> <span> [<a href="https://arxiv.org/pdf/2410.23805">pdf</a>, <a href="https://arxiv.org/format/2410.23805">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> </div> </div> <p class="title is-5 mathjax"> MemANNS: Enhancing Billion-Scale ANNS Efficiency with Practical PIM Hardware </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+S">Sitian Chen</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+A+C">Amelie Chi Zhou</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yucheng Shi</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yusen Li</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+X">Xin Yao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.23805v1-abstract-short" style="display: inline;"> In numerous production environments, Approximate Nearest Neighbor Search (ANNS) plays an indispensable role, particularly when dealing with massive datasets that can contain billions of entries. The necessity for rapid response times in these applications makes the efficiency of ANNS algorithms crucial. However, traditional ANNS approaches encounter substantial challenges at the billion-scale leve… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23805v1-abstract-full').style.display = 'inline'; document.getElementById('2410.23805v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.23805v1-abstract-full" style="display: none;"> In numerous production environments, Approximate Nearest Neighbor Search (ANNS) plays an indispensable role, particularly when dealing with massive datasets that can contain billions of entries. The necessity for rapid response times in these applications makes the efficiency of ANNS algorithms crucial. However, traditional ANNS approaches encounter substantial challenges at the billion-scale level. CPU-based methods are hindered by the limitations of memory bandwidth, while GPU-based methods struggle with memory capacity and resource utilization efficiency. This paper introduces MemANNS, an innovative framework that utilizes UPMEM PIM architecture to address the memory bottlenecks in ANNS algorithms at scale. We concentrate on optimizing a well-known ANNS algorithm, IVFPQ, for PIM hardware through several techniques. First, we introduce an architecture-aware strategy for data placement and query scheduling that ensures an even distribution of workload across PIM chips, thereby maximizing the use of aggregated memory bandwidth. Additionally, we have developed an efficient thread scheduling mechanism that capitalizes on PIM's multi-threading capabilities and enhances memory management to boost cache efficiency. Moreover, we have recognized that real-world datasets often feature vectors with frequently co-occurring items. To address this, we propose a novel encoding method for IVFPQ that minimizes memory accesses during query processing. Our comprehensive evaluation using actual PIM hardware and real-world datasets at the billion-scale, show that MemANNS offers a significant 4.3x increase in QPS over CPU-based Faiss, and it matches the performance of GPU-based Faiss implementations. Additionally, MemANNS improves energy efficiency, with a 2.3x enhancement in QPS/Watt compared to GPU solutions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23805v1-abstract-full').style.display = 'none'; document.getElementById('2410.23805v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.23775">arXiv:2410.23775</a> <span> [<a href="https://arxiv.org/pdf/2410.23775">pdf</a>, <a href="https://arxiv.org/format/2410.23775">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> In-Context LoRA for Diffusion Transformers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+L">Lianghua Huang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wei Wang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Z">Zhi-Fan Wu</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yupeng Shi</a>, <a href="/search/cs?searchtype=author&query=Dou%2C+H">Huanzhang Dou</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+C">Chen Liang</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Y">Yutong Feng</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yu Liu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Jingren Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.23775v3-abstract-short" style="display: inline;"> Recent research arXiv:2410.15027 has explored the use of diffusion transformers (DiTs) for task-agnostic image generation by simply concatenating attention tokens across images. However, despite substantial computational resources, the fidelity of the generated images remains suboptimal. In this study, we reevaluate and streamline this framework by hypothesizing that text-to-image DiTs inherently… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23775v3-abstract-full').style.display = 'inline'; document.getElementById('2410.23775v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.23775v3-abstract-full" style="display: none;"> Recent research arXiv:2410.15027 has explored the use of diffusion transformers (DiTs) for task-agnostic image generation by simply concatenating attention tokens across images. However, despite substantial computational resources, the fidelity of the generated images remains suboptimal. In this study, we reevaluate and streamline this framework by hypothesizing that text-to-image DiTs inherently possess in-context generation capabilities, requiring only minimal tuning to activate them. Through diverse task experiments, we qualitatively demonstrate that existing text-to-image DiTs can effectively perform in-context generation without any tuning. Building on this insight, we propose a remarkably simple pipeline to leverage the in-context abilities of DiTs: (1) concatenate images instead of tokens, (2) perform joint captioning of multiple images, and (3) apply task-specific LoRA tuning using small datasets (e.g., 20~100 samples) instead of full-parameter tuning with large datasets. We name our models In-Context LoRA (IC-LoRA). This approach requires no modifications to the original DiT models, only changes to the training data. Remarkably, our pipeline generates high-fidelity image sets that better adhere to prompts. While task-specific in terms of tuning data, our framework remains task-agnostic in architecture and pipeline, offering a powerful tool for the community and providing valuable insights for further research on product-level task-agnostic generation systems. We release our code, data, and models at https://github.com/ali-vilab/In-Context-LoRA <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23775v3-abstract-full').style.display = 'none'; document.getElementById('2410.23775v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Tech report. Project page: https://ali-vilab.github.io/In-Context-LoRA-Page/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.23391">arXiv:2410.23391</a> <span> [<a href="https://arxiv.org/pdf/2410.23391">pdf</a>, <a href="https://arxiv.org/format/2410.23391">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Understanding Representation of Deep Equilibrium Models from Neural Collapse Perspective </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sun%2C+H">Haixiang Sun</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Ye Shi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.23391v1-abstract-short" style="display: inline;"> Deep Equilibrium Model (DEQ), which serves as a typical implicit neural network, emphasizes their memory efficiency and competitive performance compared to explicit neural networks. However, there has been relatively limited theoretical analysis on the representation of DEQ. In this paper, we utilize the Neural Collapse ($\mathcal{NC}$) as a tool to systematically analyze the representation of DEQ… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23391v1-abstract-full').style.display = 'inline'; document.getElementById('2410.23391v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.23391v1-abstract-full" style="display: none;"> Deep Equilibrium Model (DEQ), which serves as a typical implicit neural network, emphasizes their memory efficiency and competitive performance compared to explicit neural networks. However, there has been relatively limited theoretical analysis on the representation of DEQ. In this paper, we utilize the Neural Collapse ($\mathcal{NC}$) as a tool to systematically analyze the representation of DEQ under both balanced and imbalanced conditions. $\mathcal{NC}$ is an interesting phenomenon in the neural network training process that characterizes the geometry of class features and classifier weights. While extensively studied in traditional explicit neural networks, the $\mathcal{NC}$ phenomenon has not received substantial attention in the context of implicit neural networks. We theoretically show that $\mathcal{NC}$ exists in DEQ under balanced conditions. Moreover, in imbalanced settings, despite the presence of minority collapse, DEQ demonstrated advantages over explicit neural networks. These advantages include the convergence of extracted features to the vertices of a simplex equiangular tight frame and self-duality properties under mild conditions, highlighting DEQ's superiority in handling imbalanced datasets. Finally, we validate our theoretical analyses through experiments in both balanced and imbalanced scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23391v1-abstract-full').style.display = 'none'; document.getElementById('2410.23391v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.23073">arXiv:2410.23073</a> <span> [<a href="https://arxiv.org/pdf/2410.23073">pdf</a>, <a href="https://arxiv.org/format/2410.23073">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> RSNet: A Light Framework for The Detection of Multi-scale Remote Sensing Targets </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hongyu Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+C">Chengcheng Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+F">Fei Wang</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yuhu Shi</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+W">Weiming Zeng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.23073v3-abstract-short" style="display: inline;"> Recent advancements in synthetic aperture radar (SAR) ship detection using deep learning have significantly improved accuracy and speed, yet effectively detecting small objects in complex backgrounds with fewer parameters remains a challenge. This letter introduces RSNet, a lightweight framework constructed to enhance ship detection in SAR imagery. To ensure accuracy with fewer parameters, we prop… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23073v3-abstract-full').style.display = 'inline'; document.getElementById('2410.23073v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.23073v3-abstract-full" style="display: none;"> Recent advancements in synthetic aperture radar (SAR) ship detection using deep learning have significantly improved accuracy and speed, yet effectively detecting small objects in complex backgrounds with fewer parameters remains a challenge. This letter introduces RSNet, a lightweight framework constructed to enhance ship detection in SAR imagery. To ensure accuracy with fewer parameters, we proposed Waveletpool-ContextGuided (WCG) as its backbone, guiding global context understanding through multi-scale wavelet features for effective detection in complex scenes. Additionally, Waveletpool-StarFusion (WSF) is introduced as the neck, employing a residual wavelet element-wise multiplication structure to achieve higher dimensional nonlinear features without increasing network width. The Lightweight-Shared (LS) module is designed as detect components to achieve efficient detection through lightweight shared convolutional structure and multi-format compatibility. Experiments on the SAR Ship Detection Dataset (SSDD) and High-Resolution SAR Image Dataset (HRSID) demonstrate that RSNet achieves a strong balance between lightweight design and detection performance, surpassing many state-of-the-art detectors, reaching 72.5\% and 67.6\% in \textbf{$\mathbf{mAP_{.50:.95}}$ }respectively with 1.49M parameters. Our code will be released soon. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23073v3-abstract-full').style.display = 'none'; document.getElementById('2410.23073v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.22057">arXiv:2410.22057</a> <span> [<a href="https://arxiv.org/pdf/2410.22057">pdf</a>, <a href="https://arxiv.org/format/2410.22057">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> FANCL: Feature-Guided Attention Network with Curriculum Learning for Brain Metastases Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zijiang Liu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xiaoyu Liu</a>, <a href="/search/cs?searchtype=author&query=Qu%2C+L">Linhao Qu</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yonghong Shi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.22057v1-abstract-short" style="display: inline;"> Accurate segmentation of brain metastases (BMs) in MR image is crucial for the diagnosis and follow-up of patients. Methods based on deep convolutional neural networks (CNNs) have achieved high segmentation performance. However, due to the loss of critical feature information caused by convolutional and pooling operations, CNNs still face great challenges in small BMs segmentation. Besides, BMs ar… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22057v1-abstract-full').style.display = 'inline'; document.getElementById('2410.22057v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.22057v1-abstract-full" style="display: none;"> Accurate segmentation of brain metastases (BMs) in MR image is crucial for the diagnosis and follow-up of patients. Methods based on deep convolutional neural networks (CNNs) have achieved high segmentation performance. However, due to the loss of critical feature information caused by convolutional and pooling operations, CNNs still face great challenges in small BMs segmentation. Besides, BMs are irregular and easily confused with healthy tissues, which makes it difficult for the model to effectively learn tumor structure during training. To address these issues, this paper proposes a novel model called feature-guided attention network with curriculum learning (FANCL). Based on CNNs, FANCL utilizes the input image and its feature to establish the intrinsic connections between metastases of different sizes, which can effectively compensate for the loss of high-level feature from small tumors with the information of large tumors. Furthermore, FANCL applies the voxel-level curriculum learning strategy to help the model gradually learn the structure and details of BMs. And baseline models of varying depths are employed as curriculum-mining networks for organizing the curriculum progression. The evaluation results on the BraTS-METS 2023 dataset indicate that FANCL significantly improves the segmentation performance, confirming the effectiveness of our method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22057v1-abstract-full').style.display = 'none'; document.getElementById('2410.22057v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.19702">arXiv:2410.19702</a> <span> [<a href="https://arxiv.org/pdf/2410.19702">pdf</a>, <a href="https://arxiv.org/format/2410.19702">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> TimeSuite: Improving MLLMs for Long Video Understanding via Grounded Tuning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zeng%2C+X">Xiangyu Zeng</a>, <a href="/search/cs?searchtype=author&query=Li%2C+K">Kunchang Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Chenting Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xinhao Li</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+T">Tianxiang Jiang</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+Z">Ziang Yan</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Songze Li</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yansong Shi</a>, <a href="/search/cs?searchtype=author&query=Yue%2C+Z">Zhengrong Yue</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yi Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yali Wang</a>, <a href="/search/cs?searchtype=author&query=Qiao%2C+Y">Yu Qiao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L">Limin Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.19702v1-abstract-short" style="display: inline;"> Multimodal Large Language Models (MLLMs) have demonstrated impressive performance in short video understanding. However, understanding long-form videos still remains challenging for MLLMs. This paper proposes TimeSuite, a collection of new designs to adapt the existing short-form video MLLMs for long video understanding, including a simple yet efficient framework to process long video sequence, a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19702v1-abstract-full').style.display = 'inline'; document.getElementById('2410.19702v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.19702v1-abstract-full" style="display: none;"> Multimodal Large Language Models (MLLMs) have demonstrated impressive performance in short video understanding. However, understanding long-form videos still remains challenging for MLLMs. This paper proposes TimeSuite, a collection of new designs to adapt the existing short-form video MLLMs for long video understanding, including a simple yet efficient framework to process long video sequence, a high-quality video dataset for grounded tuning of MLLMs, and a carefully-designed instruction tuning task to explicitly incorporate the grounding supervision in the traditional QA format. Specifically, based on VideoChat, we propose our long-video MLLM, coined as VideoChat-T, by implementing a token shuffling to compress long video tokens and introducing Temporal Adaptive Position Encoding (TAPE) to enhance the temporal awareness of visual representation. Meanwhile, we introduce the TimePro, a comprehensive grounding-centric instruction tuning dataset composed of 9 tasks and 349k high-quality grounded annotations. Notably, we design a new instruction tuning task type, called Temporal Grounded Caption, to peform detailed video descriptions with the corresponding time stamps prediction. This explicit temporal location prediction will guide MLLM to correctly attend on the visual content when generating description, and thus reduce the hallucination risk caused by the LLMs. Experimental results demonstrate that our TimeSuite provides a successful solution to enhance the long video understanding capability of short-form MLLM, achieving improvement of 5.6% and 6.8% on the benchmarks of Egoschema and VideoMME, respectively. In addition, VideoChat-T exhibits robust zero-shot temporal grounding capabilities, significantly outperforming the existing state-of-the-art MLLMs. After fine-tuning, it performs on par with the traditional supervised expert models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19702v1-abstract-full').style.display = 'none'; document.getElementById('2410.19702v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.19432">arXiv:2410.19432</a> <span> [<a href="https://arxiv.org/pdf/2410.19432">pdf</a>, <a href="https://arxiv.org/format/2410.19432">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Image-Based Visual Servoing for Enhanced Cooperation of Dual-Arm Manipulation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zizhe Zhang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yuan Yang</a>, <a href="/search/cs?searchtype=author&query=Zuo%2C+W">Wenqiang Zuo</a>, <a href="/search/cs?searchtype=author&query=Song%2C+G">Guangming Song</a>, <a href="/search/cs?searchtype=author&query=Song%2C+A">Aiguo Song</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yang Shi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.19432v2-abstract-short" style="display: inline;"> The cooperation of a pair of robot manipulators is required to manipulate a target object without any fixtures. The conventional control methods coordinate the end-effector pose of each manipulator with that of the other using their kinematics and joint coordinate measurements. Yet, the manipulators' inaccurate kinematics and joint coordinate measurements can cause significant pose synchronization… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19432v2-abstract-full').style.display = 'inline'; document.getElementById('2410.19432v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.19432v2-abstract-full" style="display: none;"> The cooperation of a pair of robot manipulators is required to manipulate a target object without any fixtures. The conventional control methods coordinate the end-effector pose of each manipulator with that of the other using their kinematics and joint coordinate measurements. Yet, the manipulators' inaccurate kinematics and joint coordinate measurements can cause significant pose synchronization errors in practice. This paper thus proposes an image-based visual servoing approach for enhancing the cooperation of a dual-arm manipulation system. On top of the classical control, the visual servoing controller lets each manipulator use its carried camera to measure the image features of the other's marker and adapt its end-effector pose with the counterpart on the move. Because visual measurements are robust to kinematic errors, the proposed control can reduce the end-effector pose synchronization errors and the fluctuations of the interaction forces of the pair of manipulators on the move. Theoretical analyses have rigorously proven the stability of the closed-loop system. Comparative experiments on real robots have substantiated the effectiveness of the proposed control. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19432v2-abstract-full').style.display = 'none'; document.getElementById('2410.19432v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 25 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 7 figures. Corresponding author: Yuan Yang (yuan_evan_yang@seu.edu.cn). For associated video file, see https://zizhe.io/assets/d16d4124b851e10a9db1775ed4a4ece9.mp4 This work has been submitted to the IEEE for possible publication</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.18786">arXiv:2410.18786</a> <span> [<a href="https://arxiv.org/pdf/2410.18786">pdf</a>, <a href="https://arxiv.org/format/2410.18786">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Applying Neural Monte Carlo Tree Search to Unsignalized Multi-intersection Scheduling for Autonomous Vehicles </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yucheng Shi</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wenlong Wang</a>, <a href="/search/cs?searchtype=author&query=Tao%2C+X">Xiaowen Tao</a>, <a href="/search/cs?searchtype=author&query=Dusparic%2C+I">Ivana Dusparic</a>, <a href="/search/cs?searchtype=author&query=Cahill%2C+V">Vinny Cahill</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.18786v1-abstract-short" style="display: inline;"> Dynamic scheduling of access to shared resources by autonomous systems is a challenging problem, characterized as being NP-hard. The complexity of this task leads to a combinatorial explosion of possibilities in highly dynamic systems where arriving requests must be continuously scheduled subject to strong safety and time constraints. An example of such a system is an unsignalized intersection, wh… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18786v1-abstract-full').style.display = 'inline'; document.getElementById('2410.18786v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.18786v1-abstract-full" style="display: none;"> Dynamic scheduling of access to shared resources by autonomous systems is a challenging problem, characterized as being NP-hard. The complexity of this task leads to a combinatorial explosion of possibilities in highly dynamic systems where arriving requests must be continuously scheduled subject to strong safety and time constraints. An example of such a system is an unsignalized intersection, where automated vehicles' access to potential conflict zones must be dynamically scheduled. In this paper, we apply Neural Monte Carlo Tree Search (NMCTS) to the challenging task of scheduling platoons of vehicles crossing unsignalized intersections. Crucially, we introduce a transformation model that maps successive sequences of potentially conflicting road-space reservation requests from platoons of vehicles into a series of board-game-like problems and use NMCTS to search for solutions representing optimal road-space allocation schedules in the context of past allocations. To optimize search, we incorporate a prioritized re-sampling method with parallel NMCTS (PNMCTS) to improve the quality of training data. To optimize training, a curriculum learning strategy is used to train the agent to schedule progressively more complex boards culminating in overlapping boards that represent busy intersections. In a busy single four-way unsignalized intersection simulation, PNMCTS solved 95\% of unseen scenarios, reducing crossing time by 43\% in light and 52\% in heavy traffic versus first-in, first-out control. In a 3x3 multi-intersection network, the proposed method maintained free-flow in light traffic when all intersections are under control of PNMCTS and outperformed state-of-the-art RL-based traffic-light controllers in average travel time by 74.5\% and total throughput by 16\% in heavy traffic. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18786v1-abstract-full').style.display = 'none'; document.getElementById('2410.18786v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.17547">arXiv:2410.17547</a> <span> [<a href="https://arxiv.org/pdf/2410.17547">pdf</a>, <a href="https://arxiv.org/format/2410.17547">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Generalizable Motion Planning via Operator Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Matada%2C+S">Sharath Matada</a>, <a href="/search/cs?searchtype=author&query=Bhan%2C+L">Luke Bhan</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yuanyuan Shi</a>, <a href="/search/cs?searchtype=author&query=Atanasov%2C+N">Nikolay Atanasov</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.17547v1-abstract-short" style="display: inline;"> In this work, we introduce a planning neural operator (PNO) for predicting the value function of a motion planning problem. We recast value function approximation as learning a single operator from the cost function space to the value function space, which is defined by an Eikonal partial differential equation (PDE). Specifically, we recast computing value functions as learning a single operator a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17547v1-abstract-full').style.display = 'inline'; document.getElementById('2410.17547v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.17547v1-abstract-full" style="display: none;"> In this work, we introduce a planning neural operator (PNO) for predicting the value function of a motion planning problem. We recast value function approximation as learning a single operator from the cost function space to the value function space, which is defined by an Eikonal partial differential equation (PDE). Specifically, we recast computing value functions as learning a single operator across continuous function spaces which prove is equivalent to solving an Eikonal PDE. Through this reformulation, our learned PNO is able to generalize to new motion planning problems without retraining. Therefore, our PNO model, despite being trained with a finite number of samples at coarse resolution, inherits the zero-shot super-resolution property of neural operators. We demonstrate accurate value function approximation at 16 times the training resolution on the MovingAI lab's 2D city dataset and compare with state-of-the-art neural value function predictors on 3D scenes from the iGibson building dataset. Lastly, we investigate employing the value function output of PNO as a heuristic function to accelerate motion planning. We show theoretically that the PNO heuristic is $蔚$-consistent by introducing an inductive bias layer that guarantees our value functions satisfy the triangle inequality. With our heuristic, we achieve a 30% decrease in nodes visited while obtaining near optimal path lengths on the MovingAI lab 2D city dataset, compared to classical planning methods (A*, RRT*). <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17547v1-abstract-full').style.display = 'none'; document.getElementById('2410.17547v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.17395">arXiv:2410.17395</a> <span> [<a href="https://arxiv.org/pdf/2410.17395">pdf</a>, <a href="https://arxiv.org/format/2410.17395">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3658617.3698479">10.1145/3658617.3698479 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> A 10.60 $渭$W 150 GOPS Mixed-Bit-Width Sparse CNN Accelerator for Life-Threatening Ventricular Arrhythmia Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Qin%2C+Y">Yifan Qin</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+Z">Zhenge Jia</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+Z">Zheyu Yan</a>, <a href="/search/cs?searchtype=author&query=Mok%2C+J">Jay Mok</a>, <a href="/search/cs?searchtype=author&query=Yung%2C+M">Manto Yung</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yu Liu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xuejiao Liu</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+W">Wujie Wen</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+L">Luhong Liang</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+K+T">Kwang-Ting Tim Cheng</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+X+S">X. Sharon Hu</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yiyu Shi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.17395v1-abstract-short" style="display: inline;"> This paper proposes an ultra-low power, mixed-bit-width sparse convolutional neural network (CNN) accelerator to accelerate ventricular arrhythmia (VA) detection. The chip achieves 50% sparsity in a quantized 1D CNN using a sparse processing element (SPE) architecture. Measurement on the prototype chip TSMC 40nm CMOS low-power (LP) process for the VA classification task demonstrates that it consum… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17395v1-abstract-full').style.display = 'inline'; document.getElementById('2410.17395v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.17395v1-abstract-full" style="display: none;"> This paper proposes an ultra-low power, mixed-bit-width sparse convolutional neural network (CNN) accelerator to accelerate ventricular arrhythmia (VA) detection. The chip achieves 50% sparsity in a quantized 1D CNN using a sparse processing element (SPE) architecture. Measurement on the prototype chip TSMC 40nm CMOS low-power (LP) process for the VA classification task demonstrates that it consumes 10.60 $渭$W of power while achieving a performance of 150 GOPS and a diagnostic accuracy of 99.95%. The computation power density is only 0.57 $渭$W/mm$^2$, which is 14.23X smaller than state-of-the-art works, making it highly suitable for implantable and wearable medical devices. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17395v1-abstract-full').style.display = 'none'; document.getElementById('2410.17395v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">2 pages, accepted to The 30th Asia and South Pacific Design Automation Conference (ASP-DAC 2025)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.16843">arXiv:2410.16843</a> <span> [<a href="https://arxiv.org/pdf/2410.16843">pdf</a>, <a href="https://arxiv.org/format/2410.16843">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Trustworthy Alignment of Retrieval-Augmented Large Language Models via Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zongmeng Zhang</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yufeng Shi</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jinhua Zhu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+W">Wengang Zhou</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+X">Xiang Qi</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+P">Peng Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Houqiang Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.16843v1-abstract-short" style="display: inline;"> Trustworthiness is an essential prerequisite for the real-world application of large language models. In this paper, we focus on the trustworthiness of language models with respect to retrieval augmentation. Despite being supported with external evidence, retrieval-augmented generation still suffers from hallucinations, one primary cause of which is the conflict between contextual and parametric k… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16843v1-abstract-full').style.display = 'inline'; document.getElementById('2410.16843v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.16843v1-abstract-full" style="display: none;"> Trustworthiness is an essential prerequisite for the real-world application of large language models. In this paper, we focus on the trustworthiness of language models with respect to retrieval augmentation. Despite being supported with external evidence, retrieval-augmented generation still suffers from hallucinations, one primary cause of which is the conflict between contextual and parametric knowledge. We deem that retrieval-augmented language models have the inherent capabilities of supplying response according to both contextual and parametric knowledge. Inspired by aligning language models with human preference, we take the first step towards aligning retrieval-augmented language models to a status where it responds relying merely on the external evidence and disregards the interference of parametric knowledge. Specifically, we propose a reinforcement learning based algorithm Trustworthy-Alignment, theoretically and experimentally demonstrating large language models' capability of reaching a trustworthy status without explicit supervision on how to respond. Our work highlights the potential of large language models on exploring its intrinsic abilities by its own and expands the application scenarios of alignment from fulfilling human preference to creating trustworthy agents. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16843v1-abstract-full').style.display = 'none'; document.getElementById('2410.16843v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICML 2024</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Proceedings of the 41st International Conference on Machine Learning, PMLR 235:59827-59850, 2024 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.16020">arXiv:2410.16020</a> <span> [<a href="https://arxiv.org/pdf/2410.16020">pdf</a>, <a href="https://arxiv.org/format/2410.16020">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> START: A Generalized State Space Model with Saliency-Driven Token-Aware Transformation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guo%2C+J">Jintao Guo</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+L">Lei Qi</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yinghuan Shi</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yang Gao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.16020v1-abstract-short" style="display: inline;"> Domain Generalization (DG) aims to enable models to generalize to unseen target domains by learning from multiple source domains. Existing DG methods primarily rely on convolutional neural networks (CNNs), which inherently learn texture biases due to their limited receptive fields, making them prone to overfitting source domains. While some works have introduced transformer-based methods (ViTs) fo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16020v1-abstract-full').style.display = 'inline'; document.getElementById('2410.16020v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.16020v1-abstract-full" style="display: none;"> Domain Generalization (DG) aims to enable models to generalize to unseen target domains by learning from multiple source domains. Existing DG methods primarily rely on convolutional neural networks (CNNs), which inherently learn texture biases due to their limited receptive fields, making them prone to overfitting source domains. While some works have introduced transformer-based methods (ViTs) for DG to leverage the global receptive field, these methods incur high computational costs due to the quadratic complexity of self-attention. Recently, advanced state space models (SSMs), represented by Mamba, have shown promising results in supervised learning tasks by achieving linear complexity in sequence length during training and fast RNN-like computation during inference. Inspired by this, we investigate the generalization ability of the Mamba model under domain shifts and find that input-dependent matrices within SSMs could accumulate and amplify domain-specific features, thus hindering model generalization. To address this issue, we propose a novel SSM-based architecture with saliency-based token-aware transformation (namely START), which achieves state-of-the-art (SOTA) performances and offers a competitive alternative to CNNs and ViTs. Our START can selectively perturb and suppress domain-specific features in salient tokens within the input-dependent matrices of SSMs, thus effectively reducing the discrepancy between different domains. Extensive experiments on five benchmarks demonstrate that START outperforms existing SOTA DG methods with efficient linear complexity. Our code is available at https://github.com/lingeringlight/START. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16020v1-abstract-full').style.display = 'none'; document.getElementById('2410.16020v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by NeurIPS2024. The code is available at https://github.com/lingeringlight/START</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.15027">arXiv:2410.15027</a> <span> [<a href="https://arxiv.org/pdf/2410.15027">pdf</a>, <a href="https://arxiv.org/format/2410.15027">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Group Diffusion Transformers are Unsupervised Multitask Learners </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+L">Lianghua Huang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wei Wang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Z">Zhi-Fan Wu</a>, <a href="/search/cs?searchtype=author&query=Dou%2C+H">Huanzhang Dou</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yupeng Shi</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Y">Yutong Feng</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+C">Chen Liang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yu Liu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Jingren Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.15027v1-abstract-short" style="display: inline;"> While large language models (LLMs) have revolutionized natural language processing with their task-agnostic capabilities, visual generation tasks such as image translation, style transfer, and character customization still rely heavily on supervised, task-specific datasets. In this work, we introduce Group Diffusion Transformers (GDTs), a novel framework that unifies diverse visual generation task… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15027v1-abstract-full').style.display = 'inline'; document.getElementById('2410.15027v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.15027v1-abstract-full" style="display: none;"> While large language models (LLMs) have revolutionized natural language processing with their task-agnostic capabilities, visual generation tasks such as image translation, style transfer, and character customization still rely heavily on supervised, task-specific datasets. In this work, we introduce Group Diffusion Transformers (GDTs), a novel framework that unifies diverse visual generation tasks by redefining them as a group generation problem. In this approach, a set of related images is generated simultaneously, optionally conditioned on a subset of the group. GDTs build upon diffusion transformers with minimal architectural modifications by concatenating self-attention tokens across images. This allows the model to implicitly capture cross-image relationships (e.g., identities, styles, layouts, surroundings, and color schemes) through caption-based correlations. Our design enables scalable, unsupervised, and task-agnostic pretraining using extensive collections of image groups sourced from multimodal internet articles, image galleries, and video frames. We evaluate GDTs on a comprehensive benchmark featuring over 200 instructions across 30 distinct visual generation tasks, including picture book creation, font design, style transfer, sketching, colorization, drawing sequence generation, and character customization. Our models achieve competitive zero-shot performance without any additional fine-tuning or gradient updates. Furthermore, ablation studies confirm the effectiveness of key components such as data scaling, group size, and model design. These results demonstrate the potential of GDTs as scalable, general-purpose visual generation systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15027v1-abstract-full').style.display = 'none'; document.getElementById('2410.15027v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.13618">arXiv:2410.13618</a> <span> [<a href="https://arxiv.org/pdf/2410.13618">pdf</a>, <a href="https://arxiv.org/format/2410.13618">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> LoLDU: Low-Rank Adaptation via Lower-Diag-Upper Decomposition for Parameter-Efficient Fine-Tuning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yiming Shi</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+J">Jiwei Wei</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yujia Wu</a>, <a href="/search/cs?searchtype=author&query=Ran%2C+R">Ran Ran</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+C">Chengwei Sun</a>, <a href="/search/cs?searchtype=author&query=He%2C+S">Shiyuan He</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yang Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.13618v1-abstract-short" style="display: inline;"> The rapid growth of model scale has necessitated substantial computational resources for fine-tuning. Existing approach such as Low-Rank Adaptation (LoRA) has sought to address the problem of handling the large updated parameters in full fine-tuning. However, LoRA utilize random initialization and optimization of low-rank matrices to approximate updated weights, which can result in suboptimal conv… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13618v1-abstract-full').style.display = 'inline'; document.getElementById('2410.13618v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.13618v1-abstract-full" style="display: none;"> The rapid growth of model scale has necessitated substantial computational resources for fine-tuning. Existing approach such as Low-Rank Adaptation (LoRA) has sought to address the problem of handling the large updated parameters in full fine-tuning. However, LoRA utilize random initialization and optimization of low-rank matrices to approximate updated weights, which can result in suboptimal convergence and an accuracy gap compared to full fine-tuning. To address these issues, we propose LoLDU, a Parameter-Efficient Fine-Tuning (PEFT) approach that significantly reduces trainable parameters by 2600 times compared to regular PEFT methods while maintaining comparable performance. LoLDU leverages Lower-Diag-Upper Decomposition (LDU) to initialize low-rank matrices for faster convergence and orthogonality. We focus on optimizing the diagonal matrix for scaling transformations. To the best of our knowledge, LoLDU has the fewest parameters among all PEFT approaches. We conducted extensive experiments across 4 instruction-following datasets, 6 natural language understanding (NLU) datasets, 8 image classification datasets, and image generation datasets with multiple model types (LLaMA2, RoBERTa, ViT, and Stable Diffusion), providing a comprehensive and detailed analysis. Our open-source code can be accessed at \href{https://github.com/SKDDJ/LoLDU}{https://github.com/SKDDJ/LoLDU}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13618v1-abstract-full').style.display = 'none'; document.getElementById('2410.13618v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages, 7 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.13195">arXiv:2410.13195</a> <span> [<a href="https://arxiv.org/pdf/2410.13195">pdf</a>, <a href="https://arxiv.org/format/2410.13195">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> UniG: Modelling Unitary 3D Gaussians for View-consistent 3D Reconstruction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+J">Jiamin Wu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+K">Kenkun Liu</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yukai Shi</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+X">Xiaoke Jiang</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+Y">Yuan Yao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Lei Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.13195v2-abstract-short" style="display: inline;"> In this work, we present UniG, a view-consistent 3D reconstruction and novel view synthesis model that generates a high-fidelity representation of 3D Gaussians from sparse images. Existing 3D Gaussians-based methods usually regress Gaussians per-pixel of each view, create 3D Gaussians per view separately, and merge them through point concatenation. Such a view-independent reconstruction approach o… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13195v2-abstract-full').style.display = 'inline'; document.getElementById('2410.13195v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.13195v2-abstract-full" style="display: none;"> In this work, we present UniG, a view-consistent 3D reconstruction and novel view synthesis model that generates a high-fidelity representation of 3D Gaussians from sparse images. Existing 3D Gaussians-based methods usually regress Gaussians per-pixel of each view, create 3D Gaussians per view separately, and merge them through point concatenation. Such a view-independent reconstruction approach often results in a view inconsistency issue, where the predicted positions of the same 3D point from different views may have discrepancies. To address this problem, we develop a DETR (DEtection TRansformer)-like framework, which treats 3D Gaussians as decoder queries and updates their parameters layer by layer by performing multi-view cross-attention (MVDFA) over multiple input images. In this way, multiple views naturally contribute to modeling a unitary representation of 3D Gaussians, thereby making 3D reconstruction more view-consistent. Moreover, as the number of 3D Gaussians used as decoder queries is irrespective of the number of input views, allow an arbitrary number of input images without causing memory explosion. Extensive experiments validate the advantages of our approach, showcasing superior performance over existing methods quantitatively (improving PSNR by 4.2 dB when trained on Objaverse and tested on the GSO benchmark) and qualitatively. The code will be released at https://github.com/jwubz123/UNIG. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13195v2-abstract-full').style.display = 'none'; document.getElementById('2410.13195v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.11509">arXiv:2410.11509</a> <span> [<a href="https://arxiv.org/pdf/2410.11509">pdf</a>, <a href="https://arxiv.org/format/2410.11509">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Dual-Teacher Ensemble Models with Double-Copy-Paste for 3D Semi-Supervised Medical Image Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Fa%2C+Z">Zhan Fa</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Shumeng Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jian Zhang</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+L">Lei Qi</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+Q">Qian Yu</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yinghuan Shi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.11509v1-abstract-short" style="display: inline;"> Semi-supervised learning (SSL) techniques address the high labeling costs in 3D medical image segmentation, with the teacher-student model being a common approach. However, using an exponential moving average (EMA) in single-teacher models may cause coupling issues, where the weights of the student and teacher models become similar, limiting the teacher's ability to provide additional knowledge fo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11509v1-abstract-full').style.display = 'inline'; document.getElementById('2410.11509v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.11509v1-abstract-full" style="display: none;"> Semi-supervised learning (SSL) techniques address the high labeling costs in 3D medical image segmentation, with the teacher-student model being a common approach. However, using an exponential moving average (EMA) in single-teacher models may cause coupling issues, where the weights of the student and teacher models become similar, limiting the teacher's ability to provide additional knowledge for the student. Dual-teacher models were introduced to address this problem but often neglected the importance of maintaining teacher model diversity, leading to coupling issues among teachers. To address the coupling issue, we incorporate a double-copy-paste (DCP) technique to enhance the diversity among the teachers. Additionally, we introduce the Staged Selective Ensemble (SSE) module, which selects different ensemble methods based on the characteristics of the samples and enables more accurate segmentation of label boundaries, thereby improving the quality of pseudo-labels. Experimental results demonstrate the effectiveness of our proposed method in 3D medical image segmentation tasks. Here is the code link: https://github.com/Fazhan-cs/DCP. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11509v1-abstract-full').style.display = 'none'; document.getElementById('2410.11509v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">35 pages, 5 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68T05 <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.5.2 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.10934">arXiv:2410.10934</a> <span> [<a href="https://arxiv.org/pdf/2410.10934">pdf</a>, <a href="https://arxiv.org/format/2410.10934">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Agent-as-a-Judge: Evaluate Agents with Agents </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhuge%2C+M">Mingchen Zhuge</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+C">Changsheng Zhao</a>, <a href="/search/cs?searchtype=author&query=Ashley%2C+D">Dylan Ashley</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wenyi Wang</a>, <a href="/search/cs?searchtype=author&query=Khizbullin%2C+D">Dmitrii Khizbullin</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+Y">Yunyang Xiong</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zechun Liu</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+E">Ernie Chang</a>, <a href="/search/cs?searchtype=author&query=Krishnamoorthi%2C+R">Raghuraman Krishnamoorthi</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+Y">Yuandong Tian</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yangyang Shi</a>, <a href="/search/cs?searchtype=author&query=Chandra%2C+V">Vikas Chandra</a>, <a href="/search/cs?searchtype=author&query=Schmidhuber%2C+J">J眉rgen Schmidhuber</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.10934v2-abstract-short" style="display: inline;"> Contemporary evaluation techniques are inadequate for agentic systems. These approaches either focus exclusively on final outcomes -- ignoring the step-by-step nature of agentic systems, or require excessive manual labour. To address this, we introduce the Agent-as-a-Judge framework, wherein agentic systems are used to evaluate agentic systems. This is an organic extension of the LLM-as-a-Judge fr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10934v2-abstract-full').style.display = 'inline'; document.getElementById('2410.10934v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.10934v2-abstract-full" style="display: none;"> Contemporary evaluation techniques are inadequate for agentic systems. These approaches either focus exclusively on final outcomes -- ignoring the step-by-step nature of agentic systems, or require excessive manual labour. To address this, we introduce the Agent-as-a-Judge framework, wherein agentic systems are used to evaluate agentic systems. This is an organic extension of the LLM-as-a-Judge framework, incorporating agentic features that enable intermediate feedback for the entire task-solving process. We apply the Agent-as-a-Judge to the task of code generation. To overcome issues with existing benchmarks and provide a proof-of-concept testbed for Agent-as-a-Judge, we present DevAI, a new benchmark of 55 realistic automated AI development tasks. It includes rich manual annotations, like a total of 365 hierarchical user requirements. We benchmark three of the popular agentic systems using Agent-as-a-Judge and find it dramatically outperforms LLM-as-a-Judge and is as reliable as our human evaluation baseline. Altogether, we believe that Agent-as-a-Judge marks a concrete step forward for modern agentic systems -- by providing rich and reliable reward signals necessary for dynamic and scalable self-improvement. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10934v2-abstract-full').style.display = 'none'; document.getElementById('2410.10934v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The project can be found at https://github.com/metauto-ai/agent-as-a-judge. The dataset is released at https://huggingface.co/DEVAI-benchmark</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.10278">arXiv:2410.10278</a> <span> [<a href="https://arxiv.org/pdf/2410.10278">pdf</a>, <a href="https://arxiv.org/format/2410.10278">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Machine Translation Evaluation Benchmark for Wu Chinese: Workflow and Analysis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yu%2C+H">Hongjian Yu</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yiming Shi</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Z">Zherui Zhou</a>, <a href="/search/cs?searchtype=author&query=Haberland%2C+C">Christopher Haberland</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.10278v1-abstract-short" style="display: inline;"> We introduce a FLORES+ dataset as an evaluation benchmark for modern Wu Chinese machine translation models and showcase its compatibility with existing Wu data. Wu Chinese is mutually unintelligible with other Sinitic languages such as Mandarin and Yue (Cantonese), but uses a set of Hanzi (Chinese characters) that profoundly overlaps with others. The population of Wu speakers is the second largest… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10278v1-abstract-full').style.display = 'inline'; document.getElementById('2410.10278v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.10278v1-abstract-full" style="display: none;"> We introduce a FLORES+ dataset as an evaluation benchmark for modern Wu Chinese machine translation models and showcase its compatibility with existing Wu data. Wu Chinese is mutually unintelligible with other Sinitic languages such as Mandarin and Yue (Cantonese), but uses a set of Hanzi (Chinese characters) that profoundly overlaps with others. The population of Wu speakers is the second largest among languages in China, but the language has been suffering from significant drop in usage especially among the younger generations. We identify Wu Chinese as a textually low-resource language and address challenges for its machine translation models. Our contributions include: (1) an open-source, manually translated dataset, (2) full documentations on the process of dataset creation and validation experiments, (3) preliminary tools for Wu Chinese normalization and segmentation, and (4) benefits and limitations of our dataset, as well as implications to other low-resource languages. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10278v1-abstract-full').style.display = 'none'; document.getElementById('2410.10278v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">EMNLP WMT 24 Open Language Data Initiative Shared Task</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.10130">arXiv:2410.10130</a> <span> [<a href="https://arxiv.org/pdf/2410.10130">pdf</a>, <a href="https://arxiv.org/format/2410.10130">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> DecKG: Decentralized Collaborative Learning with Knowledge Graph Enhancement for POI Recommendation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zheng%2C+R">Ruiqi Zheng</a>, <a href="/search/cs?searchtype=author&query=Qu%2C+L">Liang Qu</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+G">Guanhua Ye</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+T">Tong Chen</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yuhui Shi</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+H">Hongzhi Yin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.10130v1-abstract-short" style="display: inline;"> Decentralized collaborative learning for Point-of-Interest (POI) recommendation has gained research interest due to its advantages in privacy preservation and efficiency, as it keeps data locally and leverages collaborative learning among clients to train models in a decentralized manner. However, since local data is often limited and insufficient for training accurate models, a common solution is… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10130v1-abstract-full').style.display = 'inline'; document.getElementById('2410.10130v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.10130v1-abstract-full" style="display: none;"> Decentralized collaborative learning for Point-of-Interest (POI) recommendation has gained research interest due to its advantages in privacy preservation and efficiency, as it keeps data locally and leverages collaborative learning among clients to train models in a decentralized manner. However, since local data is often limited and insufficient for training accurate models, a common solution is integrating external knowledge as auxiliary information to enhance model performance. Nevertheless, this solution poses challenges for decentralized collaborative learning. Due to private nature of local data, identifying relevant auxiliary information specific to each user is non-trivial. Furthermore, resource-constrained local devices struggle to accommodate all auxiliary information, which places heavy burden on local storage. To fill the gap, we propose a novel decentralized collaborative learning with knowledge graph enhancement framework for POI recommendation (DecKG). Instead of directly uploading interacted items, users generate desensitized check-in data by uploading general categories of interacted items and sampling similar items from same category. The server then pretrains KG without sensitive user-item interactions and deploys relevant partitioned sub-KGs to individual users. Entities are further refined on the device, allowing client to client communication to exchange knowledge learned from local data and sub-KGs. Evaluations across two real-world datasets demonstrate DecKG's effectiveness recommendation performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10130v1-abstract-full').style.display = 'none'; document.getElementById('2410.10130v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.10118">arXiv:2410.10118</a> <span> [<a href="https://arxiv.org/pdf/2410.10118">pdf</a>, <a href="https://arxiv.org/format/2410.10118">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Chemical Physics">physics.chem-ph</span> </div> </div> <p class="title is-5 mathjax"> Physical Consistency Bridges Heterogeneous Data in Molecular Multi-Task Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ren%2C+Y">Yuxuan Ren</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+D">Dihan Zheng</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+C">Chang Liu</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+P">Peiran Jin</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yu Shi</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+L">Lin Huang</a>, <a href="/search/cs?searchtype=author&query=He%2C+J">Jiyan He</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+S">Shengjie Luo</a>, <a href="/search/cs?searchtype=author&query=Qin%2C+T">Tao Qin</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+T">Tie-Yan Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.10118v1-abstract-short" style="display: inline;"> In recent years, machine learning has demonstrated impressive capability in handling molecular science tasks. To support various molecular properties at scale, machine learning models are trained in the multi-task learning paradigm. Nevertheless, data of different molecular properties are often not aligned: some quantities, e.g. equilibrium structure, demand more cost to compute than others, e.g.… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10118v1-abstract-full').style.display = 'inline'; document.getElementById('2410.10118v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.10118v1-abstract-full" style="display: none;"> In recent years, machine learning has demonstrated impressive capability in handling molecular science tasks. To support various molecular properties at scale, machine learning models are trained in the multi-task learning paradigm. Nevertheless, data of different molecular properties are often not aligned: some quantities, e.g. equilibrium structure, demand more cost to compute than others, e.g. energy, so their data are often generated by cheaper computational methods at the cost of lower accuracy, which cannot be directly overcome through multi-task learning. Moreover, it is not straightforward to leverage abundant data of other tasks to benefit a particular task. To handle such data heterogeneity challenges, we exploit the specialty of molecular tasks that there are physical laws connecting them, and design consistency training approaches that allow different tasks to exchange information directly so as to improve one another. Particularly, we demonstrate that the more accurate energy data can improve the accuracy of structure prediction. We also find that consistency training can directly leverage force and off-equilibrium structure data to improve structure prediction, demonstrating a broad capability for integrating heterogeneous data. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10118v1-abstract-full').style.display = 'none'; document.getElementById('2410.10118v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published as a conference paper at NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.09562">arXiv:2410.09562</a> <span> [<a href="https://arxiv.org/pdf/2410.09562">pdf</a>, <a href="https://arxiv.org/format/2410.09562">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> SituFont: A Just-in-Time Adaptive Intervention System for Enhancing Mobile Readability in Situational Visual Impairments </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yue%2C+K">Kun Yue</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Mingshan Zhang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jingruo Chen</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+C">Chun Yu</a>, <a href="/search/cs?searchtype=author&query=Nie%2C+K">Kexin Nie</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Z">Zhiqi Gao</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jinghan Yang</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+C">Chen Liang</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yuanchun Shi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.09562v1-abstract-short" style="display: inline;"> Situational visual impairments (SVIs) significantly impact mobile readability, causing user discomfort and hindering information access. This paper introduces SituFont, a novel just-in-time adaptive intervention (JITAI) system designed to enhance mobile text readability by semi-automatically adjusting font parameters in response to real-time contextual changes. Leveraging smartphone sensors and a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09562v1-abstract-full').style.display = 'inline'; document.getElementById('2410.09562v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.09562v1-abstract-full" style="display: none;"> Situational visual impairments (SVIs) significantly impact mobile readability, causing user discomfort and hindering information access. This paper introduces SituFont, a novel just-in-time adaptive intervention (JITAI) system designed to enhance mobile text readability by semi-automatically adjusting font parameters in response to real-time contextual changes. Leveraging smartphone sensors and a human-in-the-loop approach, SituFont personalizes the reading experience by adapting to individual user preferences, including personal factors such as fatigue and distraction level, and environmental factors like lighting, motion, and location. To inform the design of SituFont, we conducted formative interviews (N=15) to identify key SVI factors affecting readability and controlled experiments (N=18) to quantify the relationship between these factors and optimal text parameters. We then evaluated SituFont's effectiveness through a comparative user study under eight simulated SVI scenarios (N=12), demonstrating its ability to overcome SVIs. Our findings highlight the potential of JITAI systems like SituFont to mitigate the impact of SVIs and enhance mobile accessibility. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09562v1-abstract-full').style.display = 'none'; document.getElementById('2410.09562v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.08893">arXiv:2410.08893</a> <span> [<a href="https://arxiv.org/pdf/2410.08893">pdf</a>, <a href="https://arxiv.org/format/2410.08893">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Drama: Mamba-Enabled Model-Based Reinforcement Learning Is Sample and Parameter Efficient </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wenlong Wang</a>, <a href="/search/cs?searchtype=author&query=Dusparic%2C+I">Ivana Dusparic</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yucheng Shi</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Ke Zhang</a>, <a href="/search/cs?searchtype=author&query=Cahill%2C+V">Vinny Cahill</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.08893v1-abstract-short" style="display: inline;"> Model-based reinforcement learning (RL) offers a solution to the data inefficiency that plagues most model-free RL algorithms. However, learning a robust world model often demands complex and deep architectures, which are expensive to compute and train. Within the world model, dynamics models are particularly crucial for accurate predictions, and various dynamics-model architectures have been expl… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08893v1-abstract-full').style.display = 'inline'; document.getElementById('2410.08893v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.08893v1-abstract-full" style="display: none;"> Model-based reinforcement learning (RL) offers a solution to the data inefficiency that plagues most model-free RL algorithms. However, learning a robust world model often demands complex and deep architectures, which are expensive to compute and train. Within the world model, dynamics models are particularly crucial for accurate predictions, and various dynamics-model architectures have been explored, each with its own set of challenges. Currently, recurrent neural network (RNN) based world models face issues such as vanishing gradients and difficulty in capturing long-term dependencies effectively. In contrast, use of transformers suffers from the well-known issues of self-attention mechanisms, where both memory and computational complexity scale as $O(n^2)$, with $n$ representing the sequence length. To address these challenges we propose a state space model (SSM) based world model, specifically based on Mamba, that achieves $O(n)$ memory and computational complexity while effectively capturing long-term dependencies and facilitating the use of longer training sequences efficiently. We also introduce a novel sampling method to mitigate the suboptimality caused by an incorrect world model in the early stages of training, combining it with the aforementioned technique to achieve a normalised score comparable to other state-of-the-art model-based RL algorithms using only a 7 million trainable parameter world model. This model is accessible and can be trained on an off-the-shelf laptop. Our code is available at https://github.com/realwenlongwang/drama.git. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08893v1-abstract-full').style.display = 'none'; document.getElementById('2410.08893v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.08582">arXiv:2410.08582</a> <span> [<a href="https://arxiv.org/pdf/2410.08582">pdf</a>, <a href="https://arxiv.org/ps/2410.08582">ps</a>, <a href="https://arxiv.org/format/2410.08582">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> DeBiFormer: Vision Transformer with Deformable Agent Bi-level Routing Attention </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Long%2C+N+H+B">Nguyen Huu Bao Long</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chenyu Zhang</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yuzhi Shi</a>, <a href="/search/cs?searchtype=author&query=Hirakawa%2C+T">Tsubasa Hirakawa</a>, <a href="/search/cs?searchtype=author&query=Yamashita%2C+T">Takayoshi Yamashita</a>, <a href="/search/cs?searchtype=author&query=Matsui%2C+T">Tohgoroh Matsui</a>, <a href="/search/cs?searchtype=author&query=Fujiyoshi%2C+H">Hironobu Fujiyoshi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.08582v1-abstract-short" style="display: inline;"> Vision Transformers with various attention modules have demonstrated superior performance on vision tasks. While using sparsity-adaptive attention, such as in DAT, has yielded strong results in image classification, the key-value pairs selected by deformable points lack semantic relevance when fine-tuning for semantic segmentation tasks. The query-aware sparsity attention in BiFormer seeks to focu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08582v1-abstract-full').style.display = 'inline'; document.getElementById('2410.08582v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.08582v1-abstract-full" style="display: none;"> Vision Transformers with various attention modules have demonstrated superior performance on vision tasks. While using sparsity-adaptive attention, such as in DAT, has yielded strong results in image classification, the key-value pairs selected by deformable points lack semantic relevance when fine-tuning for semantic segmentation tasks. The query-aware sparsity attention in BiFormer seeks to focus each query on top-k routed regions. However, during attention calculation, the selected key-value pairs are influenced by too many irrelevant queries, reducing attention on the more important ones. To address these issues, we propose the Deformable Bi-level Routing Attention (DBRA) module, which optimizes the selection of key-value pairs using agent queries and enhances the interpretability of queries in attention maps. Based on this, we introduce the Deformable Bi-level Routing Attention Transformer (DeBiFormer), a novel general-purpose vision transformer built with the DBRA module. DeBiFormer has been validated on various computer vision tasks, including image classification, object detection, and semantic segmentation, providing strong evidence of its effectiveness.Code is available at {https://github.com/maclong01/DeBiFormer} <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08582v1-abstract-full').style.display = 'none'; document.getElementById('2410.08582v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">20 pages, 7 figures. arXiv admin note: text overlap with arXiv:2303.08810 by other authors</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> ACCV 2024 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.08260">arXiv:2410.08260</a> <span> [<a href="https://arxiv.org/pdf/2410.08260">pdf</a>, <a href="https://arxiv.org/format/2410.08260">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Koala-36M: A Large-scale Video Dataset Improving Consistency between Fine-grained Conditions and Video Content </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qiuheng Wang</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yukai Shi</a>, <a href="/search/cs?searchtype=author&query=Ou%2C+J">Jiarong Ou</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+R">Rui Chen</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+K">Ke Lin</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jiahao Wang</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+B">Boyuan Jiang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+H">Haotian Yang</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+M">Mingwu Zheng</a>, <a href="/search/cs?searchtype=author&query=Tao%2C+X">Xin Tao</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+F">Fei Yang</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+P">Pengfei Wan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+D">Di Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.08260v1-abstract-short" style="display: inline;"> As visual generation technologies continue to advance, the scale of video datasets has expanded rapidly, and the quality of these datasets is critical to the performance of video generation models. We argue that temporal splitting, detailed captions, and video quality filtering are three key factors that determine dataset quality. However, existing datasets exhibit various limitations in these are… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08260v1-abstract-full').style.display = 'inline'; document.getElementById('2410.08260v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.08260v1-abstract-full" style="display: none;"> As visual generation technologies continue to advance, the scale of video datasets has expanded rapidly, and the quality of these datasets is critical to the performance of video generation models. We argue that temporal splitting, detailed captions, and video quality filtering are three key factors that determine dataset quality. However, existing datasets exhibit various limitations in these areas. To address these challenges, we introduce Koala-36M, a large-scale, high-quality video dataset featuring accurate temporal splitting, detailed captions, and superior video quality. The core of our approach lies in improving the consistency between fine-grained conditions and video content. Specifically, we employ a linear classifier on probability distributions to enhance the accuracy of transition detection, ensuring better temporal consistency. We then provide structured captions for the splitted videos, with an average length of 200 words, to improve text-video alignment. Additionally, we develop a Video Training Suitability Score (VTSS) that integrates multiple sub-metrics, allowing us to filter high-quality videos from the original corpus. Finally, we incorporate several metrics into the training process of the generation model, further refining the fine-grained conditions. Our experiments demonstrate the effectiveness of our data processing pipeline and the quality of the proposed Koala-36M dataset. Our dataset and code will be released at https://koala36m.github.io/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08260v1-abstract-full').style.display = 'none'; document.getElementById('2410.08260v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page: https://koala36m.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.07535">arXiv:2410.07535</a> <span> [<a href="https://arxiv.org/pdf/2410.07535">pdf</a>, <a href="https://arxiv.org/format/2410.07535">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> Constraint representation towards precise data-driven storytelling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yu-Zhe Shi</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Haotian Li</a>, <a href="/search/cs?searchtype=author&query=Ruan%2C+L">Lecheng Ruan</a>, <a href="/search/cs?searchtype=author&query=Qu%2C+H">Huamin Qu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.07535v1-abstract-short" style="display: inline;"> Data-driven storytelling serves as a crucial bridge for communicating ideas in a persuasive way. However, the manual creation of data stories is a multifaceted, labor-intensive, and case-specific effort, limiting their broader application. As a result, automating the creation of data stories has emerged as a significant research thrust. Despite advances in Artificial Intelligence, the systematic g… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.07535v1-abstract-full').style.display = 'inline'; document.getElementById('2410.07535v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.07535v1-abstract-full" style="display: none;"> Data-driven storytelling serves as a crucial bridge for communicating ideas in a persuasive way. However, the manual creation of data stories is a multifaceted, labor-intensive, and case-specific effort, limiting their broader application. As a result, automating the creation of data stories has emerged as a significant research thrust. Despite advances in Artificial Intelligence, the systematic generation of data stories remains challenging due to their hybrid nature: they must frame a perspective based on a seed idea in a top-down manner, similar to traditional storytelling, while coherently grounding insights of given evidence in a bottom-up fashion, akin to data analysis. These dual requirements necessitate precise constraints on the permissible space of a data story. In this viewpoint, we propose integrating constraints into the data story generation process. Defined upon the hierarchies of interpretation and articulation, constraints shape both narrations and illustrations to align with seed ideas and contextualized evidence. We identify the taxonomy and required functionalities of these constraints. Although constraints can be heterogeneous and latent, we explore the potential to represent them in a computation-friendly fashion via Domain-Specific Languages. We believe that leveraging constraints will facilitate both artistic and scientific aspects of data story generation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.07535v1-abstract-full').style.display = 'none'; document.getElementById('2410.07535v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">In 2024 IEEE Visualization and Visual Analytics Gen4DS (VIS-Gen4DS'24)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.07277">arXiv:2410.07277</a> <span> [<a href="https://arxiv.org/pdf/2410.07277">pdf</a>, <a href="https://arxiv.org/format/2410.07277">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Swin-BERT: A Feature Fusion System designed for Speech-based Alzheimer's Dementia Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Pan%2C+Y">Yilin Pan</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yanpei Shi</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yijia Zhang</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+M">Mingyu Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.07277v1-abstract-short" style="display: inline;"> Speech is usually used for constructing an automatic Alzheimer's dementia (AD) detection system, as the acoustic and linguistic abilities show a decline in people living with AD at the early stages. However, speech includes not only AD-related local and global information but also other information unrelated to cognitive status, such as age and gender. In this paper, we propose a speech-based syst… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.07277v1-abstract-full').style.display = 'inline'; document.getElementById('2410.07277v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.07277v1-abstract-full" style="display: none;"> Speech is usually used for constructing an automatic Alzheimer's dementia (AD) detection system, as the acoustic and linguistic abilities show a decline in people living with AD at the early stages. However, speech includes not only AD-related local and global information but also other information unrelated to cognitive status, such as age and gender. In this paper, we propose a speech-based system named Swin-BERT for automatic dementia detection. For the acoustic part, the shifted windows multi-head attention that proposed to extract local and global information from images, is used for designing our acoustic-based system. To decouple the effect of age and gender on acoustic feature extraction, they are used as an extra input of the designed acoustic system. For the linguistic part, the rhythm-related information, which varies significantly between people living with and without AD, is removed while transcribing the audio recordings into transcripts. To compensate for the removed rhythm-related information, the character-level transcripts are proposed to be used as the extra input of a word-level BERT-style system. Finally, the Swin-BERT combines the acoustic features learned from our proposed acoustic-based system with our linguistic-based system. The experiments are based on the two datasets provided by the international dementia detection challenges: the ADReSS and ADReSSo. The results show that both the proposed acoustic and linguistic systems can be better or comparable with previous research on the two datasets. Superior results are achieved by the proposed Swin-BERT system on the ADReSS and ADReSSo datasets, which are 85.58\% F-score and 87.32\% F-score respectively. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.07277v1-abstract-full').style.display = 'none'; document.getElementById('2410.07277v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.05663">arXiv:2410.05663</a> <span> [<a href="https://arxiv.org/pdf/2410.05663">pdf</a>, <a href="https://arxiv.org/format/2410.05663">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Abstract Hardware Grounding towards the Automated Design of Automation Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yu-Zhe Shi</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Q">Qiao Xu</a>, <a href="/search/cs?searchtype=author&query=Meng%2C+F">Fanxu Meng</a>, <a href="/search/cs?searchtype=author&query=Ruan%2C+L">Lecheng Ruan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qining Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.05663v1-abstract-short" style="display: inline;"> Crafting automation systems tailored for specific domains requires aligning the space of human experts' semantics with the space of robot executable actions, and scheduling the required resources and system layout accordingly. Regrettably, there are three major gaps, fine-grained domain-specific knowledge injection, heterogeneity between human knowledge and robot instructions, and diversity of use… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05663v1-abstract-full').style.display = 'inline'; document.getElementById('2410.05663v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.05663v1-abstract-full" style="display: none;"> Crafting automation systems tailored for specific domains requires aligning the space of human experts' semantics with the space of robot executable actions, and scheduling the required resources and system layout accordingly. Regrettably, there are three major gaps, fine-grained domain-specific knowledge injection, heterogeneity between human knowledge and robot instructions, and diversity of users' preferences, resulting automation system design a case-by-case and labour-intensive effort, thus hindering the democratization of automation. We refer to this challenging alignment as the abstract hardware grounding problem, where we firstly regard the procedural operations in humans' semantics space as the abstraction of hardware requirements, then we ground such abstractions to instantiated hardware devices, subject to constraints and preferences in the real world -- optimizing this problem is essentially standardizing and automating the design of automation systems. On this basis, we develop an automated design framework in a hybrid data-driven and principle-derived fashion. Results on designing self-driving laboratories for enhancing experiment-driven scientific discovery suggest our framework's potential to produce compact systems that fully satisfy domain-specific and user-customized requirements with no redundancy. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05663v1-abstract-full').style.display = 'none'; document.getElementById('2410.05663v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">In International Conference on Intelligent Robotics and Applications (ICIRA'24)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.05262">arXiv:2410.05262</a> <span> [<a href="https://arxiv.org/pdf/2410.05262">pdf</a>, <a href="https://arxiv.org/format/2410.05262">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> TurtleBench: Evaluating Top Language Models via Real-World Yes/No Puzzles </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yu%2C+Q">Qingchen Yu</a>, <a href="/search/cs?searchtype=author&query=Song%2C+S">Shichao Song</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+K">Ke Fang</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yunfeng Shi</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Z">Zifan Zheng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hanyu Wang</a>, <a href="/search/cs?searchtype=author&query=Niu%2C+S">Simin Niu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhiyu Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.05262v1-abstract-short" style="display: inline;"> As the application of Large Language Models (LLMs) expands, the demand for reliable evaluations increases. Existing LLM evaluation benchmarks primarily rely on static datasets, making it challenging to assess model performance in dynamic interactions with users. Moreover, these benchmarks often depend on specific background knowledge, complicating the measurement of a model's logical reasoning cap… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05262v1-abstract-full').style.display = 'inline'; document.getElementById('2410.05262v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.05262v1-abstract-full" style="display: none;"> As the application of Large Language Models (LLMs) expands, the demand for reliable evaluations increases. Existing LLM evaluation benchmarks primarily rely on static datasets, making it challenging to assess model performance in dynamic interactions with users. Moreover, these benchmarks often depend on specific background knowledge, complicating the measurement of a model's logical reasoning capabilities. Other dynamic evaluation methods based on strong models or manual efforts may introduce biases and incur high costs and time demands, hindering large-scale application. To address these issues, we propose TurtleBench. TurtleBench collects real user guesses from our online Turtle Soup Puzzle platform that we developed. This approach allows for the relatively dynamic generation of evaluation datasets, mitigating the risk of model cheating while aligning assessments more closely with genuine user needs for reasoning capabilities, thus enhancing the reliability of evaluations. TurtleBench includes 1,532 user guesses along with the correctness of guesses after annotation. Using this dataset, we thoroughly evaluated nine of the most advanced LLMs available today. Notably, the OpenAI o1 series models did not achieve leading results in these evaluations. We propose several hypotheses for further research, such as "the latent reasoning of o1 utilizes trivial Chain-of-Thought (CoT) techniques" and "increasing CoT length not only provides reasoning benefits but also incurs noise costs." <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05262v1-abstract-full').style.display = 'none'; document.getElementById('2410.05262v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">22 pages</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Shi%2C+Y&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Shi%2C+Y&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Shi%2C+Y&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Shi%2C+Y&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Shi%2C+Y&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Shi%2C+Y&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <div class="is-hidden-tablet">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary">  <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div>   <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div>  </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

CINXE.COM

Search | arXiv e-print repository