Search | arXiv e-print repository
Showing 1–50 of 196 results for author: Hu, K
Searching in archive cs. Search in all archives: https://arxiv.org/search/?searchtype=author&query=Hu%2C+K
Sorted by announcement date (newest first), 50 results per page. Page 1 of 4.
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Hu%2C+K&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Hu%2C+K&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Hu%2C+K&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Hu%2C+K&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Hu%2C+K&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11493">arXiv:2411.11493</a> <span> [<a href="https://arxiv.org/pdf/2411.11493">pdf</a>, <a href="https://arxiv.org/format/2411.11493">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> LSRAM: A Lightweight Autoscaling and SLO Resource Allocation Framework for Microservices Based on Gradient Descent </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hu%2C+K">Kan Hu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+M">Minxian Xu</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+K">Kejiang Ye</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+C">Chengzhong Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11493v1-abstract-short" style="display: inline;"> Microservices architecture has become the dominant architecture in cloud computing paradigm with its advantages of facilitating development, deployment, modularity and scalability. The workflow of microservices architecture is transparent to the users, who are concerned with the quality of service (QoS). 
Taking Service Level Objective (SLO) as an important indicator of system resource scaling can… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11493v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11493v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11493v1-abstract-full" style="display: none;"> Microservices architecture has become the dominant architecture in cloud computing paradigm with its advantages of facilitating development, deployment, modularity and scalability. The workflow of microservices architecture is transparent to the users, who are concerned with the quality of service (QoS). Taking Service Level Objective (SLO) as an important indicator of system resource scaling can effectively ensure user's QoS, but how to quickly allocate end-to-end SLOs to each microservice in a complete service so that it can obtain the optimal SLO resource allocation scheme is still a challenging problem. Existing microservice autoscaling frameworks based on SLO resources often have heavy and complex models that demand substantial time and computational resources to get a suitable resource allocation scheme. Moreover, when the system environment or microservice application changes, these methods require significant time and resources for model retraining. In this paper, we propose LSRAM, a lightweight SLO resource allocation management framework based on the gradient descent method to overcome the limitation of existing methods in terms of heavy model, time-consuming, poor scalability, and difficulty in retraining. LSRAM has two stages: at stage one, the lightweight SLO resource allocation model from LSRAM can quickly compute the appropriate SLO resources for each microservice; at stage two, LSRAM's SLO resource update model enables the entire framework to quickly adapt to changes in the cluster environment (e.g. load and applications). Additionally, LSRAM can effectively handle bursty traffic and highly fluctuating load application scenarios. Compared to state-of-the-art SLO allocation frameworks, LSRAM not only guarantees users' QoS but also reduces resource usage by 17%. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11493v1-abstract-full').style.display = 'none'; document.getElementById('2411.11493v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
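   The gradient-descent flavor of SLO splitting can be illustrated with a toy budget problem. This is a minimal sketch, assuming an invented cost model in which service i needs k[i] / s[i] resource units to meet a per-service latency budget s[i]; it is not LSRAM's actual model.

   ```python
   import numpy as np

   def allocate_slo(k, total_slo, lr=1e-5, steps=5000):
       """Split an end-to-end latency SLO across chained services by
       projected gradient descent. Toy cost model (assumption): service i
       needs k[i] / s[i] resource units to meet per-service budget s[i]."""
       s = np.full(len(k), total_slo / len(k))   # start from an even split
       for _ in range(steps):
           grad = -k / s**2                      # d/ds_i of k_i / s_i
           s -= lr * grad
           s += (total_slo - s.sum()) / len(k)   # project onto sum(s) == SLO
           s = np.clip(s, 1e-6, None)            # keep budgets positive
       return s

   # Three services; the first needs 4x the resources per unit of latency,
   # so it receives the largest slice of a 0.2 s end-to-end SLO.
   print(allocate_slo(np.array([4.0, 1.0, 1.0]), total_slo=0.2))
   ```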
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">22 pages</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Software: Practice and Experience 2024 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.08063">arXiv:2411.08063</a> <span> [<a href="https://arxiv.org/pdf/2411.08063">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Physics and Society">physics.soc-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Materials Science">cond-mat.mtrl-sci</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> MatPilot: an LLM-enabled AI Materials Scientist under the Framework of Human-Machine Collaboration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ni%2C+Z">Ziqi Ni</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yahao Li</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+K">Kaijia Hu</a>, <a href="/search/cs?searchtype=author&query=Han%2C+K">Kunyuan Han</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+M">Ming Xu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xingyu Chen</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+F">Fengqi Liu</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+Y">Yicong Ye</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+S">Shuxin Bai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.08063v1-abstract-short" style="display: inline;"> The rapid evolution of artificial intelligence, particularly large language models, presents unprecedented opportunities for materials science research. We proposed and developed an AI materials scientist named MatPilot, which has shown encouraging abilities in the discovery of new materials. The core strength of MatPilot is its natural language interactive human-machine collaboration, which augme… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08063v1-abstract-full').style.display = 'inline'; document.getElementById('2411.08063v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.08063v1-abstract-full" style="display: none;"> The rapid evolution of artificial intelligence, particularly large language models, presents unprecedented opportunities for materials science research. We proposed and developed an AI materials scientist named MatPilot, which has shown encouraging abilities in the discovery of new materials. The core strength of MatPilot is its natural language interactive human-machine collaboration, which augments the research capabilities of human scientist teams through a multi-agent system. MatPilot integrates unique cognitive abilities, extensive accumulated experience, and ongoing curiosity of human-beings with the AI agents' capabilities of advanced abstraction, complex knowledge storage and high-dimensional information processing. 
It could generate scientific hypotheses and experimental schemes, and employ predictive models and optimization algorithms to drive an automated experimental platform for experiments. It turns out that our system demonstrates capabilities for efficient validation, continuous learning, and iterative optimization. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08063v1-abstract-full').style.display = 'none'; document.getElementById('2411.08063v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06508">arXiv:2411.06508</a> <span> [<a href="https://arxiv.org/pdf/2411.06508">pdf</a>, <a href="https://arxiv.org/format/2411.06508">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Understanding the Role of Equivariance in Self-supervised Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yifei Wang</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+K">Kaiwen Hu</a>, <a href="/search/cs?searchtype=author&query=Gupta%2C+S">Sharut Gupta</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+Z">Ziyu Ye</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yisen Wang</a>, <a href="/search/cs?searchtype=author&query=Jegelka%2C+S">Stefanie Jegelka</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06508v1-abstract-short" style="display: inline;"> Contrastive learning has been a leading paradigm for self-supervised learning, but it is widely observed that it comes at the price of sacrificing useful features (\eg colors) by being invariant to data augmentations. Given this limitation, there has been a surge of interest in equivariant self-supervised learning (E-SSL) that learns features to be augmentation-aware. However, even for the simples… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06508v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06508v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06508v1-abstract-full" style="display: none;"> Contrastive learning has been a leading paradigm for self-supervised learning, but it is widely observed that it comes at the price of sacrificing useful features (\eg colors) by being invariant to data augmentations. 
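   The "simplest rotation-prediction method" the abstract mentions can be made concrete with a short sketch: a generic RotNet-style E-SSL objective with a stand-in encoder. This illustrates the task family the paper analyzes, not the paper's own training setup; the encoder and head shapes are invented.

   ```python
   import torch
   import torch.nn as nn

   # Stand-in encoder (assumption); any backbone would do.
   encoder = nn.Sequential(nn.Flatten(), nn.Linear(3 * 32 * 32, 128), nn.ReLU())
   rot_head = nn.Linear(128, 4)  # predict which of 4 rotations was applied

   def essl_rotation_loss(x):
       k = torch.randint(0, 4, (x.size(0),))              # rotation labels
       x_rot = torch.stack([torch.rot90(img, int(r), dims=(-2, -1))
                            for img, r in zip(x, k)])     # apply rotations
       logits = rot_head(encoder(x_rot))
       return nn.functional.cross_entropy(logits, k)      # equivariant task

   loss = essl_rotation_loss(torch.randn(8, 3, 32, 32))   # toy batch
   loss.backward()
   ```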
4. arXiv:2411.05945 [pdf, other]  cs.CL cs.AI cs.LG cs.MA eess.AS
   NeKo: Toward Post Recognition Generative Correction Large Language Models with Task-Oriented Experts
   Authors: Yen-Ting Lin, Chao-Han Huck Yang, Zhehuai Chen, Piotr Zelasko, Xuesong Yang, Zih-Ching Chen, Krishna C Puvvada, Szu-Wei Fu, Ke Hu, Jun Wei Chiu, Jagadeesh Balam, Boris Ginsburg, Yu-Chiang Frank Wang
   Abstract: Construction of a general-purpose post-recognition error corrector poses a crucial question: how can we most effectively train a model on a large mixture of domain datasets? The answer lies in learning dataset-specific features and digesting their knowledge in a single model. Previous methods achieve this by maintaining separate correction language models, resulting in a significant increase in parameters. In this work, we present Mixture-of-Experts as a solution, highlighting that MoEs are much more than a scalability tool. We propose a Multi-Task Correction MoE, where we train the experts to become "experts" in speech-to-text, language-to-text, and vision-to-text datasets by learning to route each dataset's tokens to its mapped expert. Experiments on the Open ASR Leaderboard show that we reach a new state-of-the-art performance, achieving an average relative 5.0% WER reduction and substantial improvements in BLEU scores for speech and translation tasks. On zero-shot evaluation, NeKo outperforms GPT-3.5 and Claude-Opus with 15.5% to 27.6% relative WER reduction on the Hyporadise benchmark. NeKo also performs competitively on grammar and post-OCR correction as a multi-task model.
   Submitted 8 November, 2024; originally announced November 2024.
   Comments: NeKo work was done in June 2024. NeKo LMs will be open-sourced at https://huggingface.co/nvidia under the MIT license.
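   The mapped-expert idea can be sketched minimally. This sketch hard-codes the dataset-to-expert mapping, whereas NeKo learns the routing; the layer sizes and expert count are invented for illustration.

   ```python
   import torch
   import torch.nn as nn

   class TaskRoutedMoE(nn.Module):
       """Task-oriented expert layer: tokens from each dataset are sent to
       that dataset's mapped expert, so each expert specializes."""
       def __init__(self, d_model=512, n_experts=3):
           super().__init__()
           self.experts = nn.ModuleList(
               nn.Sequential(nn.Linear(d_model, 4 * d_model), nn.GELU(),
                             nn.Linear(4 * d_model, d_model))
               for _ in range(n_experts))

       def forward(self, hidden, task_id):
           # hidden: (batch, seq, d_model); task_id: int, e.g.
           # 0=speech-to-text, 1=language-to-text, 2=vision-to-text
           return self.experts[task_id](hidden)

   moe = TaskRoutedMoE()
   out = moe(torch.randn(2, 16, 512), task_id=0)  # route an ASR batch
   ```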
href="/search/cs?searchtype=author&query=Puvvada%2C+K+C">Krishna C Puvvada</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+S">Szu-Wei Fu</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+K">Ke Hu</a>, <a href="/search/cs?searchtype=author&query=Chiu%2C+J+W">Jun Wei Chiu</a>, <a href="/search/cs?searchtype=author&query=Balam%2C+J">Jagadeesh Balam</a>, <a href="/search/cs?searchtype=author&query=Ginsburg%2C+B">Boris Ginsburg</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y+F">Yu-Chiang Frank Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.05945v1-abstract-short" style="display: inline;"> Construction of a general-purpose post-recognition error corrector poses a crucial question: how can we most effectively train a model on a large mixture of domain datasets? The answer would lie in learning dataset-specific features and digesting their knowledge in a single model. Previous methods achieve this by having separate correction language models, resulting in a significant increase in pa… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05945v1-abstract-full').style.display = 'inline'; document.getElementById('2411.05945v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.05945v1-abstract-full" style="display: none;"> Construction of a general-purpose post-recognition error corrector poses a crucial question: how can we most effectively train a model on a large mixture of domain datasets? The answer would lie in learning dataset-specific features and digesting their knowledge in a single model. Previous methods achieve this by having separate correction language models, resulting in a significant increase in parameters. In this work, we present Mixture-of-Experts as a solution, highlighting that MoEs are much more than a scalability tool. We propose a Multi-Task Correction MoE, where we train the experts to become an ``expert'' of speech-to-text, language-to-text and vision-to-text datasets by learning to route each dataset's tokens to its mapped expert. Experiments on the Open ASR Leaderboard show that we explore a new state-of-the-art performance by achieving an average relative $5.0$% WER reduction and substantial improvements in BLEU scores for speech and translation tasks. On zero-shot evaluation, NeKo outperforms GPT-3.5 and Claude-Opus with $15.5$% to $27.6$% relative WER reduction in the Hyporadise benchmark. NeKo performs competitively on grammar and post-OCR correction as a multi-task model. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05945v1-abstract-full').style.display = 'none'; document.getElementById('2411.05945v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeKo work has been done in June 2024. 
NeKo LMs will be open source on https://huggingface.co/nvidia under the MIT license</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.04919">arXiv:2411.04919</a> <span> [<a href="https://arxiv.org/pdf/2411.04919">pdf</a>, <a href="https://arxiv.org/format/2411.04919">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Stem-OB: Generalizable Visual Imitation Learning with Stem-Like Convergent Observation through Diffusion Inversion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hu%2C+K">Kaizhe Hu</a>, <a href="/search/cs?searchtype=author&query=Rui%2C+Z">Zihang Rui</a>, <a href="/search/cs?searchtype=author&query=He%2C+Y">Yao He</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yuyao Liu</a>, <a href="/search/cs?searchtype=author&query=Hua%2C+P">Pu Hua</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+H">Huazhe Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.04919v2-abstract-short" style="display: inline;"> Visual imitation learning methods demonstrate strong performance, yet they lack generalization when faced with visual input perturbations, including variations in lighting and textures, impeding their real-world application. We propose Stem-OB that utilizes pretrained image diffusion models to suppress low-level visual differences while maintaining high-level scene structures. This image inversion… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04919v2-abstract-full').style.display = 'inline'; document.getElementById('2411.04919v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.04919v2-abstract-full" style="display: none;"> Visual imitation learning methods demonstrate strong performance, yet they lack generalization when faced with visual input perturbations, including variations in lighting and textures, impeding their real-world application. We propose Stem-OB that utilizes pretrained image diffusion models to suppress low-level visual differences while maintaining high-level scene structures. This image inversion process is akin to transforming the observation into a shared representation, from which other observations stem, with extraneous details removed. Stem-OB contrasts with data-augmentation approaches as it is robust to various unspecified appearance changes without the need for additional training. Our method is a simple yet highly effective plug-and-play solution. Empirical results confirm the effectiveness of our approach in simulated tasks and show an exceptionally significant improvement in real-world applications, with an average increase of 22.2% in success rates compared to the best baseline. See https://hukz18.github.io/Stem-Ob/ for more info. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04919v2-abstract-full').style.display = 'none'; document.getElementById('2411.04919v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Arxiv preprint version, website: https://hukz18.github.io/Stem-Ob/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02272">arXiv:2411.02272</a> <span> [<a href="https://arxiv.org/pdf/2411.02272">pdf</a>, <a href="https://arxiv.org/format/2411.02272">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Combining Induction and Transduction for Abstract Reasoning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+W">Wen-Ding Li</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+K">Keya Hu</a>, <a href="/search/cs?searchtype=author&query=Larsen%2C+C">Carter Larsen</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yuqing Wu</a>, <a href="/search/cs?searchtype=author&query=Alford%2C+S">Simon Alford</a>, <a href="/search/cs?searchtype=author&query=Woo%2C+C">Caleb Woo</a>, <a href="/search/cs?searchtype=author&query=Dunn%2C+S+M">Spencer M. Dunn</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+H">Hao Tang</a>, <a href="/search/cs?searchtype=author&query=Naim%2C+M">Michelangelo Naim</a>, <a href="/search/cs?searchtype=author&query=Nguyen%2C+D">Dat Nguyen</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+W">Wei-Long Zheng</a>, <a href="/search/cs?searchtype=author&query=Tavares%2C+Z">Zenna Tavares</a>, <a href="/search/cs?searchtype=author&query=Pu%2C+Y">Yewen Pu</a>, <a href="/search/cs?searchtype=author&query=Ellis%2C+K">Kevin Ellis</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02272v3-abstract-short" style="display: inline;"> When learning an input-output mapping from very few examples, is it better to first infer a latent function that explains the examples, or is it better to directly predict new test outputs, e.g. using a neural network? We study this question on ARC, a highly diverse dataset of abstract reasoning tasks. 
We train neural models for induction (inferring latent functions) and transduction (directly pre… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02272v3-abstract-full').style.display = 'inline'; document.getElementById('2411.02272v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02272v3-abstract-full" style="display: none;"> When learning an input-output mapping from very few examples, is it better to first infer a latent function that explains the examples, or is it better to directly predict new test outputs, e.g. using a neural network? We study this question on ARC, a highly diverse dataset of abstract reasoning tasks. We train neural models for induction (inferring latent functions) and transduction (directly predicting the test output for a given test input). Our models are trained on synthetic data generated by prompting LLMs to produce Python code specifying a function to be inferred, plus a stochastic subroutine for generating inputs to that function. We find inductive and transductive models solve very different problems, despite training on the same problems, and despite sharing the same neural architecture. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02272v3-abstract-full').style.display = 'none'; document.getElementById('2411.02272v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.22837">arXiv:2410.22837</a> <span> [<a href="https://arxiv.org/pdf/2410.22837">pdf</a>, <a href="https://arxiv.org/format/2410.22837">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.3233/FAIA240524">10.3233/FAIA240524 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> SFDFusion: An Efficient Spatial-Frequency Domain Fusion Network for Infrared and Visible Image Fusion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hu%2C+K">Kun Hu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Q">Qingle Zhang</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+M">Maoxun Yuan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yitian Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.22837v1-abstract-short" style="display: inline;"> Infrared and visible image fusion aims to utilize the complementary information from two modalities to generate fused images with prominent targets and rich texture details. 
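   The induction/transduction contrast is easy to state in code. Below is a toy version on a 2x2 grid task; the "model" responses are hard-coded stubs standing in for the trained neural models, purely to show the two solution modes.

   ```python
   # Toy ARC-style task: invert every cell of the grid.
   train_pairs = [([[1, 0], [0, 1]], [[0, 1], [1, 0]])]
   test_input = [[1, 1], [0, 0]]

   def propose_program(pairs):
       # Induction: infer an explicit latent function from the examples
       # (stub: a real model would synthesize this code).
       return "def f(grid): return [[1 - c for c in row] for row in grid]"

   def predict_output(pairs, x):
       # Transduction: map examples + test input straight to a test output,
       # with no intermediate program (stub for a direct neural prediction).
       return [[0, 0], [1, 1]]

   scope = {}
   exec(propose_program(train_pairs), scope)
   print(scope["f"](test_input))                    # induction answer
   print(predict_output(train_pairs, test_input))   # transduction answer
   ```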
7. arXiv:2410.22837 [pdf, other]  cs.CV  doi: 10.3233/FAIA240524
   SFDFusion: An Efficient Spatial-Frequency Domain Fusion Network for Infrared and Visible Image Fusion
   Authors: Kun Hu, Qingle Zhang, Maoxun Yuan, Yitian Zhang
   Abstract: Infrared and visible image fusion aims to utilize the complementary information from the two modalities to generate fused images with prominent targets and rich texture details. Most existing algorithms only perform pixel-level or feature-level fusion in the spatial domain. They usually overlook the information in the frequency domain, and some suffer from inefficiency due to excessively complex structures. To tackle these challenges, this paper proposes an efficient Spatial-Frequency Domain Fusion (SFDFusion) network for infrared and visible image fusion. First, we propose a Dual-Modality Refinement Module (DMRM) to extract complementary information. This module extracts useful information from both the infrared and visible modalities in the spatial domain and enhances fine-grained spatial details. Next, to introduce frequency-domain information, we construct a Frequency Domain Fusion Module (FDFM) that transforms the spatial domain to the frequency domain through the Fast Fourier Transform (FFT) and then integrates frequency-domain information. Additionally, we design a frequency-domain fusion loss to guide the fusion process. Extensive experiments on public datasets demonstrate that our method produces fused images with significant advantages in various fusion metrics and visual effects. Furthermore, our method demonstrates high efficiency in image fusion and good performance on downstream detection tasks, thereby satisfying the real-time demands of advanced visual tasks.
   Submitted 30 October, 2024; originally announced October 2024.
   Comments: Accepted at ECAI 2024
8. arXiv:2410.17485 [pdf, other]  cs.CL eess.AS
   VoiceTextBlender: Augmenting Large Language Models with Speech Capabilities via Single-Stage Joint Speech-Text Supervised Fine-Tuning
   Authors: Yifan Peng, Krishna C. Puvvada, Zhehuai Chen, Piotr Zelasko, He Huang, Kunal Dhawan, Ke Hu, Shinji Watanabe, Jagadeesh Balam, Boris Ginsburg
   Abstract: Recent studies have augmented large language models (LLMs) with speech capabilities, leading to the development of speech language models (SpeechLMs). Earlier SpeechLMs focused on single-turn speech-based question answering (QA), where user input comprised a speech context and a text question. More recent studies have extended this to multi-turn conversations, though they often require complex, multi-stage supervised fine-tuning (SFT) with diverse data. Another critical challenge with SpeechLMs is catastrophic forgetting, where models optimized for speech tasks suffer significant degradation in text-only performance. To mitigate these issues, we propose a novel single-stage joint speech-text SFT approach on the low-rank adaptation (LoRA) of the LLM backbone. Our joint SFT combines text-only SFT data with three types of speech-related data: speech recognition and translation, speech-based QA, and mixed-modal SFT. Compared to previous SpeechLMs with 7B or 13B parameters, our 3B model demonstrates superior performance across various speech benchmarks while preserving the original capabilities on text-only tasks. Furthermore, our model shows emergent abilities of effectively handling previously unseen prompts and tasks, including multi-turn, mixed-modal inputs.
   Submitted 22 October, 2024; originally announced October 2024.
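   A single-stage joint recipe of this shape can be sketched with standard libraries. The base model name, LoRA hyperparameters, and mixture weights below are placeholders, not the paper's configuration.

   ```python
   import torch
   from transformers import AutoModelForCausalLM
   from peft import LoraConfig, get_peft_model

   # LoRA adapters on the LLM backbone (model and ranks are assumptions).
   model = AutoModelForCausalLM.from_pretrained("gpt2")
   model = get_peft_model(model, LoraConfig(r=16, lora_alpha=32,
                                            target_modules=["c_attn"]))

   # One stage, one mixture: text-only SFT plus three speech-related types.
   datasets = {"text_sft": 0.25, "asr_ast": 0.25,
               "speech_qa": 0.25, "mixed_modal": 0.25}

   def sample_source():
       # Draw each batch's source from the mixture; no multi-stage curriculum.
       names, probs = zip(*datasets.items())
       return names[torch.multinomial(torch.tensor(probs), 1).item()]
   ```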
9. arXiv:2409.16209 [pdf, other]  cs.CV
   LLMCount: Enhancing Stationary mmWave Detection with Multimodal-LLM
   Authors: Boyan Li, Shengyi Ding, Deen Ma, Yixuan Wu, Hongjie Liao, Kaiyuan Hu
   Abstract: Millimeter-wave sensing provides people with the capability of sensing surrounding crowds in a non-invasive and privacy-preserving manner, which holds huge application potential. However, detecting stationary crowds remains challenging due to several factors, such as minimal movements (like breathing or casual fidgets), which can easily be treated as noise clusters during data collection and consequently filtered out in subsequent processing. Additionally, the uneven distribution of signal power due to attenuation, together with interference from external reflectors or absorbers, further complicates accurate detection. To address these challenges and enable stationary crowd detection across application scenarios requiring specialized domain adaptation, we introduce LLMCount, the first system to harness the capabilities of large language models (LLMs) to enhance crowd-detection performance. By exploiting the decision-making capability of the LLM, we can compensate for the signal power to acquire a uniform distribution and thereby achieve detection with higher accuracy. To assess the system's performance, comprehensive evaluations were conducted in diverse scenarios such as halls, meeting rooms, and cinemas. The results show that our approach reaches high detection accuracy with lower overall latency than previous methods.
   Submitted 11 November, 2024; v1 submitted 24 September, 2024; originally announced September 2024.

10. arXiv:2409.15352 [pdf]  cs.CY
    An Interactive Web Application for School-Based Physical Fitness Testing in California: Geospatial Analysis and Custom Mapping
    Authors: Yawen Guo, Kaiyuan Hu, Di Hu, Kai Zheng, Dan Cooper
    Abstract: Physical activity is essential for children's healthy growth and development. In the US, most states, including California, adhere to physical education standards and have implemented the mandated School-Based Physical Fitness Testing (SB-PFT) for over two decades. Despite extensive data collection, research utilization of SB-PFT has been limited by the absence of accessible analytical tools. We developed a web application using GeoServer, ArcGIS, and AWS to visualize SB-PFT data. This user-friendly platform enables education administrators and policymakers to analyze trends in children's physical fitness, identify successful programs at schools and districts, and evaluate new physical education initiatives. The application also features a custom mapping tool for comparing external datasets with SB-PFT data. We conclude that this platform, by integrating advanced analytical capabilities in an informatics-based tool, significantly enhances engagement in promoting children's physical fitness.
    Submitted 11 September, 2024; originally announced September 2024.
    Comments: AMIA Annual Symposium Proceedings 2024

11. arXiv:2409.14953 [pdf, other]  cs.DC
    MSARS: A Meta-Learning and Reinforcement Learning Framework for SLO Resource Allocation and Adaptive Scaling for Microservices
    Authors: Kan Hu, Linfeng Wen, Minxian Xu, Kejiang Ye
    Abstract: Service Level Objectives (SLOs) set thresholds for service time in cloud services to ensure acceptable quality of service (QoS) and user satisfaction. Many studies now treat SLOs as a system resource to be allocated, ensuring that QoS meets the SLOs. Existing microservice auto-scaling frameworks that rely on SLO resources often utilize complex and computationally intensive models, requiring significant time and resources to determine appropriate resource allocation. This paper aims to rapidly allocate SLO resources and minimize resource costs while ensuring that application QoS meets SLO requirements in a dynamically changing microservice environment. We propose MSARS, a framework that leverages meta-learning to quickly derive SLO resource allocation strategies and employs reinforcement learning for adaptive scaling of microservice resources. It features three innovative components: first, MSARS uses graph convolutional networks to predict the most suitable SLO resource allocation scheme for the current environment; second, it uses meta-learning to enable the graph neural network to adapt quickly to environmental changes, ensuring adaptability in highly dynamic microservice environments; third, it generates auto-scaling policies for each microservice based on an improved Twin Delayed Deep Deterministic Policy Gradient (TD3) model. The adaptive auto-scaling policy integrates the SLO resource allocation strategy into the scheduling algorithm to satisfy SLOs. Compared with state-of-the-art resource auto-scaling algorithms that utilize neural networks and reinforcement learning, MSARS takes 40% less time to adapt to new environments, achieves a 38% reduction in SLO violations, and incurs 8% lower resource costs.
    Submitted 23 September, 2024; originally announced September 2024.
    Comments: 10 pages, 6 figures, IEEE ISPA 2024
arXiv:2409.13523 (https://arxiv.org/abs/2409.13523) [pdf, other] cs.CL, cs.SD, eess.AS
Title: EMMeTT: Efficient Multimodal Machine Translation Training
Authors: Piotr Żelasko, Zhehuai Chen, Mengru Wang, Daniel Galvez, Oleksii Hrinchuk, Shuoyang Ding, Ke Hu, Jagadeesh Balam, Vitaly Lavrukhin, Boris Ginsburg
Abstract: A rising interest in the modality extension of foundation language models warrants discussion of the most effective, and efficient, multimodal training approach. This work focuses on neural machine translation (NMT) and proposes a joint multimodal training regime of Speech-LLM to include automatic speech translation (AST). We investigate two different foundation model architectures, decoder-only GPT and encoder-decoder T5, extended with Canary-1B's speech encoder. To handle joint multimodal training, we propose a novel training framework called EMMeTT.
EMMeTT improves training efficiency with the following: balanced sampling across languages, datasets, and modalities; efficient sequential data iteration; and a novel 2D bucketing scheme for multimodal data, complemented by a batch size optimizer (OOMptimizer). We show that multimodal training consistently helps with both architectures. Moreover, SALM-T5 trained with EMMeTT retains the original NMT capability while outperforming AST baselines on four-language subsets of FLORES and FLEURS. The resultant Multimodal Translation Model produces strong text and speech translation results at the same time.
Submitted 20 September, 2024; originally announced September 2024.
Comments: 4 pages, submitted to ICASSP 2025
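The 2D bucketing idea can be sketched in a few lines: examples are binned jointly by speech length and text length so each batch contains similarly sized sequences. The bucket edges and the token-budget rule below are assumptions for illustration, not EMMeTT's actual scheme or its OOMptimizer.

```python
# Sketch of 2D bucketing for multimodal (speech, text) batches.
from collections import defaultdict
import bisect

AUDIO_EDGES = [5, 10, 20, 30]     # seconds (assumed bucket edges)
TEXT_EDGES  = [32, 64, 128, 256]  # tokens  (assumed bucket edges)

def bucket_id(audio_sec, n_tokens):
    """Assign an example to a 2D bucket by audio length and text length."""
    return (bisect.bisect(AUDIO_EDGES, audio_sec), bisect.bisect(TEXT_EDGES, n_tokens))

def make_batches(examples, tokens_per_batch=4096):
    """Group same-bucket examples; batch size shrinks as sequences grow."""
    buckets = defaultdict(list)
    for ex in examples:
        buckets[bucket_id(ex["audio_sec"], ex["n_tokens"])].append(ex)
    for (_ai, ti), items in buckets.items():
        bsz = max(1, tokens_per_batch // TEXT_EDGES[min(ti, len(TEXT_EDGES) - 1)])
        for i in range(0, len(items), bsz):
            yield items[i:i + bsz]
```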
arXiv:2409.11538 (https://arxiv.org/abs/2409.11538) [pdf, other] cs.CL
Title: Chain-of-Thought Prompting for Speech Translation
Authors: Ke Hu, Zhehuai Chen, Chao-Han Huck Yang, Piotr Żelasko, Oleksii Hrinchuk, Vitaly Lavrukhin, Jagadeesh Balam, Boris Ginsburg
Abstract: Large language models (LLMs) have demonstrated remarkable advancements in language understanding and generation. Building on the success of text-based LLMs, recent research has adapted these models to use speech embeddings for prompting, resulting in Speech-LLM models that exhibit strong performance in automatic speech recognition (ASR) and automatic speech translation (AST). In this work, we propose a novel approach to leverage ASR transcripts as prompts for AST in a Speech-LLM built on an encoder-decoder text LLM. The Speech-LLM model consists of a speech encoder and an encoder-decoder Megatron-T5. By first decoding speech to generate ASR transcripts and subsequently using these transcripts along with the encoded speech for prompting, we guide the speech translation in a two-step process akin to chain-of-thought (CoT) prompting. Low-rank adaptation (LoRA) is used to adapt the T5 LLM and shows superior performance to full model fine-tuning. Experimental results show that the proposed CoT prompting significantly improves AST performance, achieving an average increase of 2.4 BLEU points across 6 En->X or X->En AST tasks compared with speech prompting alone. Additionally, compared with a related CoT prediction method that predicts a concatenated sequence of ASR and AST transcripts, our method performs better by an average of 2 BLEU points.
Submitted 17 September, 2024; originally announced September 2024.
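The two-step decoding this abstract describes can be summarized as a short sketch; `speech_llm` and its `encode_speech`/`generate` interface are hypothetical stand-ins, not the paper's API.

```python
# Two-step "chain-of-thought" decoding sketch for speech translation:
# first decode an ASR transcript, then condition the translation on
# transcript + encoded speech. All interfaces are assumed placeholders.
def translate_with_cot(speech_llm, speech, src_lang="en", tgt_lang="de"):
    speech_emb = speech_llm.encode_speech(speech)          # speech encoder output
    # Step 1: ASR -- transcribe the utterance in the source language.
    transcript = speech_llm.generate(
        prompt=f"Transcribe the {src_lang} speech:", speech=speech_emb)
    # Step 2: AST -- prompt with the transcript *and* the encoded speech.
    translation = speech_llm.generate(
        prompt=f"{src_lang} transcript: {transcript}\n"
               f"Translate to {tgt_lang}:", speech=speech_emb)
    return transcript, translation
```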
arXiv:2409.00884 (https://arxiv.org/abs/2409.00884) [pdf] eess.IV, cs.CV
Title: A Novel Hybrid Parameter-Efficient Fine-Tuning Approach for Hippocampus Segmentation and Alzheimer's Disease Diagnosis
Authors: Wangang Cheng, Guanghua He, Keli Hu, Mingyu Fang, Liang Dong, Zhong Li, Hancan Zhu
Abstract: Deep learning methods have significantly advanced medical image segmentation, yet their success hinges on large volumes of manually annotated data, which require specialized expertise for accurate labeling. Additionally, these methods often demand substantial computational resources, particularly for three-dimensional medical imaging tasks. Consequently, applying deep learning techniques for medical image segmentation with limited annotated data and computational resources remains a critical challenge. In this paper, we propose a novel parameter-efficient fine-tuning strategy, termed HyPS, which employs a hybrid parallel and serial architecture. HyPS updates a minimal subset of model parameters, thereby retaining the pre-trained model's original knowledge structure while enhancing its ability to learn specific features relevant to downstream tasks. We apply this strategy to the state-of-the-art SwinUNETR model for medical image segmentation. Initially, the model is pre-trained on the BraTS2021 dataset, after which the HyPS method is employed to transfer it to three distinct hippocampus datasets. Extensive experiments demonstrate that HyPS outperforms baseline methods, especially in scenarios with limited training samples.
Furthermore, based on the segmentation results, we calculated the hippocampal volumes of subjects from the ADNI dataset and combined these with metadata to classify disease types. In distinguishing Alzheimer's disease (AD) from cognitively normal (CN) individuals, as well as early mild cognitive impairment (EMCI) from late mild cognitive impairment (LMCI), HyPS achieved classification accuracies of 83.78% and 64.29%, respectively. These findings indicate that the HyPS method not only facilitates effective hippocampal segmentation using pre-trained models but also holds potential for aiding Alzheimer's disease detection. Our code is publicly available.
Submitted 1 September, 2024; originally announced September 2024.
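As a rough illustration of the parameter-efficient recipe (freeze the pre-trained backbone, train only small inserted modules), here is a serial bottleneck adapter; HyPS's hybrid parallel-and-serial placement is more elaborate than this sketch.

```python
# Illustrative parameter-efficient fine-tuning skeleton: a bottleneck adapter
# plus a freezing rule. Simplified relative to the HyPS architecture.
import torch.nn as nn

class Adapter(nn.Module):
    """Bottleneck adapter: down-project, nonlinearity, up-project, residual add."""
    def __init__(self, dim, bottleneck=16):
        super().__init__()
        self.down = nn.Linear(dim, bottleneck)
        self.up = nn.Linear(bottleneck, dim)
        self.act = nn.GELU()

    def forward(self, x):
        return x + self.up(self.act(self.down(x)))  # serial (residual) form

def freeze_backbone_except_adapters(model):
    """Train only parameters whose names mark them as adapter weights."""
    for name, p in model.named_parameters():
        p.requires_grad = "adapter" in name
```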
arXiv:2409.00353 (https://arxiv.org/abs/2409.00353) [pdf, other] cs.CV
Title: RI-MAE: Rotation-Invariant Masked AutoEncoders for Self-Supervised Point Cloud Representation Learning
Authors: Kunming Su, Qiuxia Wu, Panpan Cai, Xiaogang Zhu, Xuequan Lu, Zhiyong Wang, Kun Hu
Abstract: Masked point modeling methods have recently achieved great success in self-supervised learning for point cloud data. However, these methods are sensitive to rotations and often exhibit sharp performance drops when encountering rotational variations. In this paper, we propose novel Rotation-Invariant Masked AutoEncoders (RI-MAE) to address two major challenges: 1) achieving rotation-invariant latent representations, and 2) facilitating self-supervised reconstruction in a rotation-invariant manner. For the first challenge, we introduce RI-Transformer, which features disentangled geometry content together with rotation-invariant relative orientation and position embedding mechanisms for constructing a rotation-invariant point cloud latent space. For the second challenge, a novel dual-branch student-teacher architecture is devised. It enables self-supervised learning via the reconstruction of masked patches within the learned rotation-invariant latent space. Each branch is based on an RI-Transformer, and they are connected by an additional RI-Transformer predictor. The teacher encodes all point patches, while the student encodes only the unmasked ones. Finally, the predictor predicts the latent features of the masked patches from the student's output latent embeddings, supervised by the teacher's outputs. Extensive experiments demonstrate that our method is robust to rotations, achieving state-of-the-art performance on various downstream tasks.
Submitted 31 August, 2024; originally announced September 2024.
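The dual-branch objective reduces to masked-latent regression plus an EMA-updated teacher, sketched below with placeholder encoders; the paper's branches are RI-Transformers, and the `predictor` signature here is an assumption.

```python
# Compact sketch of a student-teacher masked-latent objective of the kind
# RI-MAE describes. Encoders and predictor are assumed placeholders.
import torch
import torch.nn.functional as F

def masked_latent_loss(student, teacher, predictor, patches, mask):
    """patches: [B, N, D]; mask: bool tensor [N], True at masked positions."""
    with torch.no_grad():
        target = teacher(patches)                  # teacher encodes all patches
    visible = patches[:, ~mask]                    # student sees only unmasked ones
    pred = predictor(student(visible), mask)       # predict masked-patch latents
    return F.mse_loss(pred, target[:, mask])

@torch.no_grad()
def ema_update(teacher, student, m=0.999):
    """Teacher weights follow the student as an exponential moving average."""
    for pt, ps in zip(teacher.parameters(), student.parameters()):
        pt.data.mul_(m).add_(ps.data, alpha=1 - m)
```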
arXiv:2408.15829 (https://arxiv.org/abs/2408.15829) [pdf, other] cs.CV
Title: SITransformer: Shared Information-Guided Transformer for Extreme Multimodal Summarization
Authors: Sicheng Liu, Lintao Wang, Xiaogan Zhu, Xuequan Lu, Zhiyong Wang, Kun Hu
Abstract: Extreme Multimodal Summarization with Multimodal Output (XMSMO) has become an attractive summarization approach by integrating various types of information to create extremely concise yet informative summaries for individual modalities. Existing methods overlook the issue that multimodal data often contain topic-irrelevant information, which can mislead the model into producing inaccurate summaries, especially extremely short ones. In this paper, we propose SITransformer, a Shared Information-guided Transformer for extreme multimodal summarization. It has a shared-information-guided pipeline comprising a cross-modal shared information extractor and a cross-modal interaction module. The extractor formulates semantically shared salient information from different modalities by devising a novel filtering process consisting of a differentiable top-k selector and a shared-information-guided gating unit. As a result, the common, salient, and relevant contents across modalities are identified. Next, a transformer with cross-modal attention is developed for intra- and inter-modality learning with the shared information guidance to produce the extreme summary. Comprehensive experiments demonstrate that SITransformer significantly enhances the summarization quality of both video and text summaries for XMSMO. Our code will be publicly available at https://github.com/SichengLeoLiu/MMAsia24-XMSMO.
Submitted 28 August, 2024; v1 submitted 28 August, 2024; originally announced August 2024.
Comments: 8 pages, 5 figures, submitted to ACM Multimedia Asia 2024
ACM Class: I.2.10
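A differentiable top-k selector of the kind the extractor relies on can be approximated with a straight-through estimator: the hard top-k mask is used in the forward pass, while gradients flow through soft scores. The paper's exact operator may differ.

```python
# Toy differentiable top-k selection via a straight-through estimator.
import torch

def topk_select(scores, k):
    """scores: [n]; returns a {0,1}-like mask differentiable w.r.t. scores."""
    soft = torch.softmax(scores, dim=-1)
    hard = torch.zeros_like(soft)
    hard[scores.topk(k).indices] = 1.0
    return hard + soft - soft.detach()   # forward: hard mask; backward: soft grads

scores = torch.randn(10, requires_grad=True)
mask = topk_select(scores, k=3)          # select the 3 most salient elements
(mask.sum()).backward()                  # gradients reach `scores` via the soft path
```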
arXiv:2408.12366 (https://arxiv.org/abs/2408.12366) [pdf, other] cs.LG, cs.CV
Title: Robust Principal Component Analysis via Discriminant Sample Weight Learning
Authors: Yingzhuo Deng, Ke Hu, Bo Li, Yao Zhang
Abstract: Principal component analysis (PCA) is a classical feature extraction method, but it may be adversely affected by outliers, resulting in inaccurate learning of the projection matrix. This paper proposes a robust method to estimate both the data mean and the PCA projection matrix by learning discriminant sample weights from data containing outliers. Each sample in the dataset is assigned a weight, and the proposed algorithm iteratively learns the weights, the mean, and the projection matrix. Specifically, when the mean and the projection matrix are available, a weight for each sample is learned hierarchically via fine-grained analysis of outliers, so that outliers receive small weights while normal samples receive large weights. With the learned weights available, a weighted optimization problem is solved to estimate both the data mean and the projection matrix. Because the learned weights discriminate outliers from normal samples, the adverse influence of outliers is mitigated by their correspondingly small weights.
Experiments on toy data, UCI datasets, and face datasets demonstrate the effectiveness of the proposed method in estimating the mean and the projection matrix from data containing outliers.
Submitted 22 August, 2024; originally announced August 2024.
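The alternating scheme is easy to sketch: estimate weights from reconstruction residuals, then solve a weighted PCA. The Gaussian weighting below stands in for the paper's hierarchical, fine-grained weight rule.

```python
# Minimal alternating robust-PCA sketch: residual-based sample weights,
# then weighted mean and weighted PCA via SVD. Weighting rule is assumed.
import numpy as np

def robust_pca(X, n_components=2, iters=10, sigma=1.0):
    n = len(X)
    w = np.ones(n) / n
    for _ in range(iters):
        mu = (w[:, None] * X).sum(0) / w.sum()            # weighted mean
        Xc = (X - mu) * np.sqrt(w)[:, None]               # weighted centering
        _, _, Vt = np.linalg.svd(Xc, full_matrices=False)
        P = Vt[:n_components].T                           # projection matrix
        resid = np.linalg.norm((X - mu) - (X - mu) @ P @ P.T, axis=1)
        w = np.exp(-resid**2 / (2 * sigma**2))            # outliers -> small weights
        w /= w.sum()
    return mu, P
```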
arXiv:2408.11494 (https://arxiv.org/abs/2408.11494) [pdf, ps, other] cs.AI
Title: Mutagenesis screen to map the functions of parameters of Large Language Models
Authors: Yue Hu, Kai Hu, Patrick X. Zhao, Javed Khan, Chengming Xu
Abstract: Large Language Models (LLMs) have significantly advanced artificial intelligence, excelling in numerous tasks. Although the functionality of a model is inherently tied to its parameters, a systematic method for exploring the connections between the parameters and the functionality is lacking. Models sharing similar structure and parameter counts exhibit significant performance disparities across various tasks, prompting investigations into the varying patterns that govern their performance. We adopted a mutagenesis screen approach, inspired by the methods used in biological studies, to investigate Llama2-7b and Zephyr. This technique involved mutating elements within the models' matrices to their maximum or minimum values to examine the relationship between model parameters and their functionalities. Our research uncovered multiple levels of fine structures within both models. Many matrices showed a mixture of maximum and minimum mutations following mutagenesis, but others were predominantly sensitive to one type. Notably, mutations that produced phenotypes, especially those with severe outcomes, tended to cluster along axes. Additionally, the locations of maximum and minimum mutations often displayed a complementary pattern on the matrix in both models, with the Gate matrix showing a unique two-dimensional asymmetry after rearrangement. In Zephyr, certain mutations consistently resulted in poetic or conversational rather than descriptive outputs. These "writer" mutations grouped according to the high-frequency initial word of the output, with a marked tendency to share the row coordinate even when they were in different matrices. Our findings affirm that the mutagenesis screen is an effective tool for deciphering the complexities of large language models and identifying unexpected ways to expand their potential, providing deeper insights into the foundational aspects of AI systems.
Submitted 29 October, 2024; v1 submitted 21 August, 2024; originally announced August 2024.
Comments: 10 pages, 6 figures, supplementary material available online
ACM Class: I.2.0
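Operationally, the screen amounts to clamping one matrix entry at a time to the matrix's extreme values and measuring the behavioural change. A skeleton is below, with `score_model` as a placeholder for whatever behavioural assay is used and an arbitrary hit threshold.

```python
# Skeleton of a mutagenesis screen over one weight matrix: clamp an entry
# to the matrix max or min, measure the effect, restore, repeat.
import torch

@torch.no_grad()
def mutagenesis_screen(weight, score_model, positions, threshold=0.1):
    baseline = score_model()                      # behaviour before mutation
    hits = []
    for (i, j) in positions:
        old = weight[i, j].item()
        for mutant in (weight.max().item(), weight.min().item()):
            weight[i, j] = mutant
            delta = score_model() - baseline      # phenotype strength
            if abs(delta) > threshold:            # arbitrary hit criterion
                hits.append((i, j, mutant, delta))
        weight[i, j] = old                        # restore the original value
    return hits
```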
arXiv:2408.11490 (https://arxiv.org/abs/2408.11490) [pdf, other] cs.CL
Title: DocTabQA: Answering Questions from Long Documents Using Tables
Authors: Haochen Wang, Kai Hu, Haoyu Dong, Liangcai Gao
Abstract: We study a new problem setting of question answering (QA), referred to as DocTabQA. Within this setting, given a long document, the goal is to respond to questions by organizing the answers into structured tables derived directly from the document's content. Unlike traditional QA approaches, which predominantly rely on unstructured text to formulate responses, DocTabQA aims to leverage structured tables as answers to convey information clearly and systematically, thereby enhancing user comprehension and highlighting relationships between data points. To the best of our knowledge, this problem has not been previously explored. In this paper, we introduce the QTabA dataset, encompassing 300 financial documents accompanied by 1.5k manually annotated question-table pairs. Initially, we leverage Large Language Models (LLMs) such as GPT-4 to establish a baseline. However, it is widely acknowledged that LLMs encounter difficulties when tasked with generating intricate, structured outputs from long input sequences. To overcome these challenges, we present a two-stage framework, called DocTabTalk, which initially retrieves relevant sentences from extensive documents and subsequently generates hierarchical tables based on these identified sentences. DocTabTalk incorporates two key technological innovations, AlignLLaMA and TabTalk, which are specifically tailored to assist GPT-4 in tackling DocTabQA, enabling it to generate well-structured, hierarchical tables with improved organization and clarity. Comprehensive experimental evaluations conducted on both the QTabA and RotoWire datasets demonstrate that DocTabTalk significantly enhances the performance of GPT-4 on our proposed DocTabQA task and on the table generation task. The code and dataset are available at https://github.com/SmileWHC/DocTabQA for further research.
Submitted 21 August, 2024; originally announced August 2024.
Comments: 18 pages, 5 figures
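The two-stage flow can be condensed to a retrieve-then-generate sketch; `retriever` and `llm` are hypothetical components, and the real DocTabTalk pipeline (AlignLLaMA, TabTalk) is considerably richer.

```python
# Two-stage retrieve-then-generate flow in the spirit of DocTabTalk.
# Component interfaces are assumed placeholders, not the paper's API.
def doc_to_table(question, document, retriever, llm, k=10):
    # Stage 1: pull the k sentences most relevant to the question.
    sentences = retriever.top_k(query=question, corpus=document.sentences, k=k)
    # Stage 2: ask the LLM to organise the evidence into a hierarchical table.
    prompt = (f"Question: {question}\n"
              "Evidence:\n" + "\n".join(sentences) +
              "\nAnswer as a hierarchical table (rows/columns with headers):")
    return llm.generate(prompt)
```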
arXiv:2408.06030 (https://arxiv.org/abs/2408.06030) [pdf, other] cs.RO
Title: Developing Smart MAVs for Autonomous Inspection in GPS-denied Constructions
Authors: Paoqiang Pan, Kewei Hu, Xiao Huang, Wei Ying, Xiaoxuan Xie, Yue Ma, Naizhong Zhang, Hanwen Kang
Abstract: Smart Micro Aerial Vehicles (MAVs) have transformed infrastructure inspection by enabling efficient, high-resolution monitoring at various stages of construction, including hard-to-reach areas. Traditional manual operation of drones in GPS-denied environments, such as industrial facilities and infrastructure, is labour-intensive, tedious, and prone to error. This study presents an innovative framework for smart MAV inspections in such complex and GPS-denied indoor environments. The framework features a hierarchical perception and planning system that identifies regions of interest and optimises task paths. It also presents an advanced MAV system with enhanced localisation and motion planning capabilities, integrated with Neural Reconstruction technology for comprehensive 3D reconstruction of building structures. The effectiveness of the framework was empirically validated in a 4,000-square-metre indoor infrastructure facility with an interior length of 80 metres, a width of 50 metres, and a height of 7 metres. The main structure consists of columns and walls.
Experimental results show that our MAV system performs exceptionally well in autonomous inspection tasks, achieving a 100% success rate in generating and executing scan paths. Extensive experiments validate the manoeuvrability of the developed MAV, achieving a 100% success rate in motion planning with a tracking error of less than 0.1 metres. In addition, the enhanced reconstruction method using 3D Gaussian Splatting enables the generation of high-fidelity rendering models from the acquired data. Overall, our novel method represents a significant advancement in the use of robotics for infrastructure inspection.
Submitted 12 August, 2024; originally announced August 2024.

arXiv:2407.21381 (https://arxiv.org/abs/2407.21381) [pdf, other] eess.IV, cs.CV
Title: Identity-Consistent Diffusion Network for Grading Knee Osteoarthritis Progression in Radiographic Imaging
Authors: Wenhua Wu, Kun Hu, Wenxi Yue, Wei Li, Milena Simic, Changyang Li, Wei Xiang, Zhiyong Wang
Abstract: Knee osteoarthritis (KOA), a common form of arthritis that causes physical disability, has become increasingly prevalent in society.
Employing computer-aided techniques to automatically assess the severity and progression of KOA can greatly benefit KOA treatment and disease management. In particular, the advancement of X-ray technology in KOA demonstrates its potential for this purpose. Yet existing X-ray prognosis research generally yields a single progression severity grade, overlooking the visual changes that would help understand and explain the progression outcome. Therefore, in this study, a novel generative model is proposed, namely the Identity-Consistent Radiographic Diffusion Network (IC-RDN), for multifaceted KOA prognosis encompassing a predicted future knee X-ray scan conditioned on the baseline scan. Specifically, an identity prior module for the diffusion and a downstream generation-guided progression prediction module are introduced. Compared with conventional image-to-image generative models, identity priors regularize and guide the diffusion to focus more on the clinical nuances of the prognosis based on a contrastive learning strategy. The progression prediction module utilizes both forecasted and baseline knee scans, yielding a more comprehensive formulation of KOA severity progression grading. Extensive experiments on a widely used public dataset, OAI, demonstrate the effectiveness of the proposed method.
Submitted 31 July, 2024; originally announced July 2024.
Comments: Accepted by ECCV 2024

arXiv:2407.19763 (https://arxiv.org/abs/2407.19763) [pdf, other] eess.IV, cs.CV
Title: TeleOR: Real-time Telemedicine System for Full-Scene Operating Room
Authors: Yixuan Wu, Kaiyuan Hu, Qian Shao, Jintai Chen, Danny Z. Chen, Jian Wu
Abstract: The advent of telemedicine represents a transformative development in leveraging technology to extend the reach of specialized medical expertise to remote surgeries, a field where the immediacy of expert guidance is paramount. However, the intricate dynamics of the Operating Room (OR) scene pose unique challenges for telemedicine, particularly in achieving high-fidelity, real-time scene reconstruction and transmission amidst obstructions and bandwidth limitations. This paper introduces TeleOR, a pioneering system designed to address these challenges through real-time OR scene reconstruction for tele-intervention. TeleOR distinguishes itself with three innovative approaches: dynamic self-calibration, which leverages inherent scene features for calibration without the need for preset markers, allowing for obstacle avoidance and real-time camera adjustment; selective OR reconstruction, which focuses on dynamically changing scene segments to reduce reconstruction complexity; and viewport-adaptive transmission, which optimizes data transmission based on real-time client feedback to efficiently deliver high-quality 3D reconstructions within bandwidth constraints. Comprehensive experiments on the 4D-OR surgical scene dataset demonstrate the superiority and applicability of TeleOR, illuminating the potential to revolutionize tele-interventions by overcoming the spatial and technical barriers inherent in remote surgical guidance.
Submitted 29 July, 2024; originally announced July 2024.
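Viewport-adaptive transmission can be illustrated as a greedy bandwidth-allocation policy that favours the scene segments the client reports looking at; the quality tiers and segment sizes below are invented for the example.

```python
# Toy viewport-adaptive transmission policy: spend bandwidth on segments
# inside the client's viewport, degrade or drop the rest. All numbers assumed.
TIERS = {"high": 8.0, "mid": 3.0, "low": 1.0}   # Mbit per segment (assumed)

def plan_transmission(segments, viewport, budget_mbit):
    """segments: segment ids; viewport: ids the client currently views."""
    plan, spent = {}, 0.0
    # Viewport segments first (False sorts before True).
    for seg in sorted(segments, key=lambda s: s not in viewport):
        for tier in ("high", "mid", "low"):
            if seg not in viewport and tier == "high":
                continue                          # reserve the top tier for the viewport
            if spent + TIERS[tier] <= budget_mbit:
                plan[seg], spent = tier, spent + TIERS[tier]
                break
    return plan
```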
arXiv:2407.19244 (https://arxiv.org/abs/2407.19244) [pdf, other] cs.CV, cs.MM
DOI: https://doi.org/10.1109/ICME57554.2024.10688347
Title: Radio Frequency Signal based Human Silhouette Segmentation: A Sequential Diffusion Approach
Authors: Penghui Wen, Kun Hu, Dong Yuan, Zhiyuan Ning, Changyang Li, Zhiyong Wang
Abstract: Radio frequency (RF) signals have proved flexible for human silhouette segmentation (HSS) under complex environments. Existing studies are mainly based on a one-shot approach, which lacks a coherent projection ability from the RF domain. Additionally, the spatio-temporal patterns of human motion dynamics have not been fully explored for HSS. Therefore, we propose a two-stage Sequential Diffusion Model (SDM) to progressively synthesize high-quality segmentation jointly with considerations of motion dynamics. Cross-view transformation blocks are devised to guide the diffusion model in a multi-scale manner for comprehensively characterizing human-related patterns in an individual frame, such as directional projection from signal planes. Moreover, spatio-temporal blocks are devised to fine-tune the frame-level model to incorporate spatio-temporal contexts and motion dynamics, enhancing the consistency of the segmentation maps.
Comprehensive experiments on a public benchmark, HIBER, demonstrate the state-of-the-art performance of our method, with an IoU of 0.732. Our code is available at https://github.com/ph-w2000/SDM.
Submitted 27 July, 2024; originally announced July 2024.

arXiv:2407.12772 (https://arxiv.org/abs/2407.12772) [pdf, other] cs.CL, cs.CV
Title: LMMs-Eval: Reality Check on the Evaluation of Large Multimodal Models
Authors: Kaichen Zhang, Bo Li, Peiyuan Zhang, Fanyi Pu, Joshua Adrian Cahyono, Kairui Hu, Shuai Liu, Yuanhan Zhang, Jingkang Yang, Chunyuan Li, Ziwei Liu
Abstract: The advances of large foundation models necessitate wide-coverage, low-cost, and zero-contamination benchmarks. Despite continuous exploration of language model evaluations, comprehensive studies on the evaluation of Large Multi-modal Models (LMMs) remain limited.
In this work, we introduce LMMS-EVAL, a unified and standardized multimodal benchmark framework with over 50 tasks and more than 10 models to promote transparent and reproducible evaluations. Although LMMS-EVAL offers comprehensive coverage, we find it still falls short in achieving low cost and zero contamination. To approach this evaluation trilemma, we further introduce LMMS-EVAL LITE, a pruned evaluation toolkit that emphasizes both coverage and efficiency. Additionally, we present Multimodal LIVEBENCH, which utilizes continuously updating news and online forums to assess models' generalization abilities in the wild, featuring a low-cost and zero-contamination evaluation approach. In summary, our work highlights the importance of considering the evaluation trilemma and provides practical solutions to navigate the trade-offs in evaluating large multimodal models, paving the way for more effective and reliable benchmarking of LMMs. We open-source our codebase and maintain the LIVEBENCH leaderboard at https://github.com/EvolvingLMMs-Lab/lmms-eval and https://huggingface.co/spaces/lmms-lab/LiveBench.
Submitted 17 July, 2024; originally announced July 2024.
Comments: Code and leaderboard are available at https://github.com/EvolvingLMMs-Lab/lmms-eval and https://huggingface.co/spaces/lmms-lab/LiveBench

arXiv:2407.11083 (https://arxiv.org/abs/2407.11083) [pdf, other] cs.LG
Title: Empowering Graph Invariance Learning with Deep Spurious Infomax
Authors: Tianjun Yao, Yongqiang Chen, Zhenhao Chen, Kai Hu, Zhiqiang Shen, Kun Zhang
Abstract: Recently, there has been a surge of interest in developing graph neural networks that utilize the invariance principle on graphs to generalize to out-of-distribution (OOD) data. Due to the limited knowledge about OOD data, existing approaches often pose assumptions about the correlation strengths of the underlying spurious features and the target labels.
arXiv:2407.10973 [pdf, other]  cs.AI
Make-An-Agent: A Generalizable Policy Network Generator with Behavior-Prompted Diffusion
Authors: Yongyuan Liang, Tingqiang Xu, Kaizhe Hu, Guangqi Jiang, Furong Huang, Huazhe Xu
Abstract: Can we generate a control policy for an agent using just one demonstration of desired behaviors as a prompt, as effortlessly as creating an image from a textual description? In this paper, we present Make-An-Agent, a novel policy parameter generator that leverages the power of conditional diffusion models for behavior-to-policy generation. Guided by behavior embeddings that encode trajectory information, our policy generator synthesizes latent parameter representations, which can then be decoded into policy networks. Trained on policy network checkpoints and their corresponding trajectories, our generation model demonstrates remarkable versatility and scalability on multiple tasks, and it generalizes well to unseen tasks, outputting well-performing policies with only few-shot demonstrations as inputs. We showcase its efficacy and efficiency on various domains and tasks, including varying objectives, behaviors, and even different robot manipulators. Beyond simulation, we directly deploy policies generated by Make-An-Agent onto real-world robots for locomotion tasks. Project page: https://cheryyunl.github.io/make-an-agent/
Submitted 3 November, 2024; v1 submitted 15 July, 2024; originally announced July 2024.
Comments: Annual Conference on Neural Information Processing Systems 38

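The behavior-to-policy generation above can be pictured as ordinary diffusion training applied to flattened policy parameters. Below is a minimal sketch of one noise-prediction training step conditioned on a behavior embedding; the denoiser architecture, the dimensions, and the plain DDPM objective are illustrative assumptions, not the paper's implementation.

    import torch
    import torch.nn as nn

    class ParamDenoiser(nn.Module):
        """Hypothetical MLP denoiser over flattened policy parameters,
        conditioned on a behavior embedding and a diffusion timestep."""
        def __init__(self, param_dim=4096, cond_dim=256, hidden=1024):
            super().__init__()
            self.net = nn.Sequential(
                nn.Linear(param_dim + cond_dim + 1, hidden), nn.SiLU(),
                nn.Linear(hidden, hidden), nn.SiLU(),
                nn.Linear(hidden, param_dim),
            )

        def forward(self, noisy_params, behavior_emb, t):
            x = torch.cat([noisy_params, behavior_emb, t[:, None]], dim=-1)
            return self.net(x)  # predicted noise

    def train_step(model, opt, params, behavior_emb, alphas_bar):
        # Corrupt the (latent) policy parameters at a random timestep.
        t = torch.randint(0, len(alphas_bar), (params.shape[0],))
        a = alphas_bar[t][:, None]
        noise = torch.randn_like(params)
        noisy = a.sqrt() * params + (1 - a).sqrt() * noise
        # Standard epsilon-prediction loss, guided by the behavior prompt.
        pred = model(noisy, behavior_emb, t.float() / len(alphas_bar))
        loss = ((pred - noise) ** 2).mean()
        opt.zero_grad()
        loss.backward()
        opt.step()
        return loss.item()

At sampling time, the reverse process would be run from pure noise, conditioned on the embedding of a single demonstration, and the resulting vector decoded into policy weights.
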
arXiv:2407.05407 [pdf, other]  cs.SD cs.AI eess.AS
CosyVoice: A Scalable Multilingual Zero-shot Text-to-speech Synthesizer based on Supervised Semantic Tokens
Authors: Zhihao Du, Qian Chen, Shiliang Zhang, Kai Hu, Heng Lu, Yexin Yang, Hangrui Hu, Siqi Zheng, Yue Gu, Ziyang Ma, Zhifu Gao, Zhijie Yan
Abstract: In recent years, large language model (LLM) based text-to-speech (TTS) has moved into the mainstream due to its high naturalness and zero-shot capacity. In this paradigm, speech signals are discretized into token sequences, which are modeled by an LLM with text as prompts and reconstructed into waveforms by a token-based vocoder. Speech tokens therefore play a critical role in LLM-based TTS models. Current speech tokens are learned in an unsupervised manner, which lacks explicit semantic information and alignment to the text. In this paper, we propose to represent speech with supervised semantic tokens, which are derived from a multilingual speech recognition model by inserting vector quantization into the encoder. Based on these tokens, we further propose a scalable zero-shot TTS synthesizer, CosyVoice, which consists of an LLM for text-to-token generation and a conditional flow matching model for token-to-speech synthesis. Experimental results show that supervised semantic tokens significantly outperform existing unsupervised tokens in terms of content consistency and speaker similarity for zero-shot voice cloning. Moreover, we find that utilizing large-scale data further improves the synthesis performance, indicating the scalable capacity of CosyVoice. To the best of our knowledge, this is the first attempt to incorporate supervised speech tokens into TTS models.
Submitted 9 July, 2024; v1 submitted 7 July, 2024; originally announced July 2024.
Comments: Work in progress. arXiv admin note: substantial text overlap with arXiv:2407.04051

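The supervised semantic tokens come from inserting vector quantization into an ASR encoder. A minimal sketch of such a quantizer is below; the codebook size, the straight-through gradient, and the layer placement are generic VQ conventions assumed for illustration, not CosyVoice's actual configuration.

    import torch
    import torch.nn as nn

    class VectorQuantizer(nn.Module):
        """Nearest-neighbor VQ with a straight-through gradient estimator."""
        def __init__(self, num_codes=4096, dim=512):
            super().__init__()
            self.codebook = nn.Embedding(num_codes, dim)

        def forward(self, h):  # h: (batch, time, dim) encoder states
            d = torch.cdist(h, self.codebook.weight[None])  # dists to codes
            ids = d.argmin(-1)              # discrete "semantic tokens"
            q = self.codebook(ids)          # quantized encoder states
            q = h + (q - h).detach()        # straight-through gradient
            return q, ids

    # Hypothetical usage inside an ASR encoder: quantize mid-level states,
    # then train the recognizer with its usual loss so that the token ids
    # end up aligned with the text.
    # vq = VectorQuantizer(); states, tokens = vq(encoder_states)

Because the recognizer is trained against transcripts, the resulting token ids carry text-aligned semantic content, which is what the abstract contrasts with unsupervised speech tokens.
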
arXiv:2407.04051 [pdf, other]  cs.SD cs.AI eess.AS
FunAudioLLM: Voice Understanding and Generation Foundation Models for Natural Interaction Between Humans and LLMs
Authors: Keyu An, Qian Chen, Chong Deng, Zhihao Du, Changfeng Gao, Zhifu Gao, Yue Gu, Ting He, Hangrui Hu, Kai Hu, Shengpeng Ji, Yabin Li, Zerui Li, Heng Lu, Haoneng Luo, Xiang Lv, Bin Ma, Ziyang Ma, Chongjia Ni, Changhe Song, Jiaqi Shi, Xian Shi, Hao Wang, Wen Wang, Yuxuan Wang, et al. (8 additional authors not shown)
Abstract: This report introduces FunAudioLLM, a model family designed to enhance natural voice interactions between humans and large language models (LLMs). At its core are two innovative models: SenseVoice, which handles multilingual speech recognition, emotion recognition, and audio event detection; and CosyVoice, which facilitates natural speech generation with control over multiple languages, timbre, speaking style, and speaker identity. SenseVoice-Small delivers exceptionally low-latency ASR for five languages, and SenseVoice-Large supports high-precision ASR for over 50 languages, while CosyVoice excels in multilingual voice generation, zero-shot in-context learning, cross-lingual voice cloning, and instruction-following capabilities. The models related to SenseVoice and CosyVoice have been open-sourced on ModelScope and Hugging Face, along with the corresponding training, inference, and fine-tuning code released on GitHub. By integrating these models with LLMs, FunAudioLLM enables applications such as speech-to-speech translation, emotional voice chat, interactive podcasts, and expressive audiobook narration, thereby pushing the boundaries of voice interaction technology. Demos are available at https://fun-audio-llm.github.io, and the code can be accessed at https://github.com/FunAudioLLM.
Submitted 10 July, 2024; v1 submitted 4 July, 2024; originally announced July 2024.
Comments: Work in progress. Authors are listed in alphabetical order by family name

arXiv:2405.20494 [pdf, other]  cs.CV cs.AI cs.LG
Slight Corruption in Pre-training Data Makes Better Diffusion Models
Authors: Hao Chen, Yujin Han, Diganta Misra, Xiang Li, Kai Hu, Difan Zou, Masashi Sugiyama, Jindong Wang, Bhiksha Raj
Abstract: Diffusion models (DMs) have shown remarkable capabilities in generating realistic, high-quality images, audio, and video. They benefit significantly from extensive pre-training on large-scale datasets, including web-crawled data with paired conditions, such as image-text and image-class pairs. Despite rigorous filtering, these pre-training datasets inevitably contain corrupted pairs whose conditions do not accurately describe the data. This paper presents the first comprehensive study of the impact of such corruption in the pre-training data of DMs. We synthetically corrupt ImageNet-1K and CC3M to pre-train and evaluate over 50 conditional DMs. Our empirical findings reveal that various types of slight corruption in pre-training can significantly enhance the quality, diversity, and fidelity of the generated images across different DMs, in both the pre-training and downstream adaptation stages. Theoretically, we consider a Gaussian mixture model and prove that slight corruption in the condition leads the distribution generated by the corruption-trained DMs to have higher entropy and a smaller 2-Wasserstein distance to the ground-truth data distribution. Inspired by our analysis, we propose a simple method to improve the training of DMs on practical datasets by adding condition embedding perturbations (CEP). CEP significantly improves the performance of various DMs in both pre-training and downstream tasks. We hope that our study provides new insights into understanding the data and pre-training processes of DMs; all models are released at https://huggingface.co/DiffusionNoise.
Submitted 30 October, 2024; v1 submitted 30 May, 2024; originally announced May 2024.
Comments: NeurIPS 2024 Spotlight

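In spirit, CEP amounts to perturbing the condition embedding during diffusion-model training; a tiny sketch follows, where the additive Gaussian form and the scale of the perturbation are assumptions for illustration.

    import torch

    def perturb_condition(cond_emb, sigma=0.1):
        """Condition embedding perturbation (CEP), sketched as additive
        Gaussian noise at train time; form and scale are assumptions."""
        return cond_emb + sigma * torch.randn_like(cond_emb)

    # During DM training, the perturbed embedding replaces the clean one:
    # loss = denoise_loss(x_t, t, perturb_condition(text_emb))
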
arXiv:2405.19854 [pdf, other]  cs.CV
RTGen: Generating Region-Text Pairs for Open-Vocabulary Object Detection
Authors: Fangyi Chen, Han Zhang, Zhantao Yang, Hao Chen, Kai Hu, Marios Savvides
Abstract: Open-vocabulary object detection (OVD) requires solid modeling of the region-semantic relationship, which could be learned from massive region-text pairs. However, such data is limited in practice due to significant annotation costs. In this work, we propose RTGen, which generates scalable open-vocabulary region-text pairs, and demonstrate its capability to boost the performance of open-vocabulary object detection. RTGen includes both text-to-region and region-to-text generation processes on scalable image-caption data. The text-to-region generation is powered by image inpainting, directed by our proposed scene-aware inpainting guider for overall layout harmony. For region-to-text generation, we perform region-level image captioning multiple times with various prompts and select the best-matching text according to CLIP similarity. To facilitate detection training on region-text pairs, we also introduce a localization-aware region-text contrastive loss that learns object proposals tailored to different localization qualities. Extensive experiments demonstrate that RTGen can serve as a scalable, semantically rich, and effective source for open-vocabulary object detection, and that it continues to improve model performance as more data is utilized, delivering superior performance compared to existing state-of-the-art methods.
Submitted 30 May, 2024; originally announced May 2024.
Comments: Technical report

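The region-to-text selection step can be sketched in a few lines, assuming a hypothetical clip_similarity(image, text) scorer standing in for a real CLIP model:

    def best_caption(region_crop, candidate_captions, clip_similarity):
        """Keep the caption whose image-text similarity to the region crop
        is highest; clip_similarity is an assumed scorer, not a real API."""
        return max(candidate_captions,
                   key=lambda c: clip_similarity(region_crop, c))
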
arXiv:2405.17503 [pdf, other]  cs.SE cs.AI cs.CL cs.PL
Code Repair with LLMs gives an Exploration-Exploitation Tradeoff
Authors: Hao Tang, Keya Hu, Jin Peng Zhou, Sicheng Zhong, Wei-Long Zheng, Xujie Si, Kevin Ellis
Abstract: Iteratively improving and repairing source code with large language models (LLMs), known as refinement, has emerged as a popular way of generating programs that would be too complex to construct in one shot. Given a bank of test cases together with a candidate program, an LLM can improve that program by being prompted with failed test cases. But it remains an open question how to best iteratively refine code, with prior work employing simple greedy or breadth-first strategies. We show here that refinement exposes an explore-exploit tradeoff: exploit by refining the program that passes the most test cases, or explore by refining a lesser-considered program. We frame this as an arm-acquiring bandit problem, which we solve with Thompson Sampling. The resulting LLM-based program synthesis algorithm is broadly applicable: across loop invariant synthesis, visual reasoning puzzles, and competition programming problems, we find that our new method can solve more problems using fewer language model calls.
Submitted 29 October, 2024; v1 submitted 26 May, 2024; originally announced May 2024.

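The arm-acquiring bandit view can be made concrete with Beta-Bernoulli Thompson Sampling: each candidate program is an arm whose reward is its test pass rate, and every refinement adds a new arm. The sketch below is a generic rendition under assumed helpers run_tests and refine, not the paper's exact algorithm.

    import random

    def thompson_refine(seed_program, run_tests, refine, budget=20):
        """run_tests(p) -> fraction of tests passed in [0, 1];
        refine(p) -> new candidate via an LLM call (both assumed given)."""
        arms = [{"prog": seed_program, "a": 1.0, "b": 1.0}]  # Beta(1,1)
        best = (run_tests(seed_program), seed_program)
        for _ in range(budget):
            # Explore/exploit: sample a pass-rate from each posterior and
            # refine the arm whose sample is highest.
            arm = max(arms, key=lambda s: random.betavariate(s["a"], s["b"]))
            child = refine(arm["prog"])        # one LLM refinement call
            score = run_tests(child)
            # Update the parent's posterior; acquire the child as a new arm.
            arm["a"] += score
            arm["b"] += 1.0 - score
            arms.append({"prog": child, "a": 1.0 + score, "b": 2.0 - score})
            best = max(best, (score, child), key=lambda t: t[0])
        return best[1]

A greedy baseline always refines the current best program; the sampled posteriors let occasionally promising but under-tested programs win the refinement slot, which is the exploration the abstract argues for.
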
arXiv:2405.17358 [pdf, other]  cs.LG cs.AI
Rethinking Transformers in Solving POMDPs
Authors: Chenhao Lu, Ruizhe Shi, Yuyao Liu, Kaizhe Hu, Simon S. Du, Huazhe Xu
Abstract: Sequential decision-making algorithms such as reinforcement learning (RL) in real-world scenarios inevitably face environments with partial observability. This paper scrutinizes the effectiveness of a popular architecture, namely Transformers, in Partially Observable Markov Decision Processes (POMDPs) and reveals its theoretical limitations. We establish that regular languages, which Transformers struggle to model, are reducible to POMDPs. This poses a significant challenge for Transformers in learning POMDP-specific inductive biases, due to their lack of the inherent recurrence found in other models like RNNs. This paper casts doubt on the prevalent belief in Transformers as sequence models for RL and proposes to introduce a point-wise recurrent structure. The Deep Linear Recurrent Unit (LRU) emerges as a well-suited alternative for partially observable RL, with empirical results highlighting the sub-optimal performance of the Transformer and the considerable strength of the LRU.
Submitted 30 May, 2024; v1 submitted 27 May, 2024; originally announced May 2024.
Comments: Accepted by ICML 2024; references added; typos fixed

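The point-wise recurrence the paper advocates is, at its core, an element-wise linear state update. A minimal real-valued cell is sketched below; actual LRUs use complex-valued diagonal dynamics and parallel scans, so this sequential, real-valued simplification is an assumption for brevity.

    import torch
    import torch.nn as nn

    class MiniLRU(nn.Module):
        """Point-wise linear recurrence: h_t = lam * h_{t-1} + B x_t,
        y_t = C h_t, with an element-wise decay lam in (0, 1)."""
        def __init__(self, d_in, d_state):
            super().__init__()
            self.log_lam = nn.Parameter(torch.randn(d_state) - 2.0)
            self.B = nn.Linear(d_in, d_state, bias=False)
            self.C = nn.Linear(d_state, d_in, bias=False)

        def forward(self, x):                  # x: (batch, time, d_in)
            lam = torch.sigmoid(self.log_lam)  # keeps the decay stable
            h = torch.zeros(x.shape[0], lam.shape[0], device=x.device)
            ys = []
            for t in range(x.shape[1]):        # the recurrence Transformers lack
                h = lam * h + self.B(x[:, t])
                ys.append(self.C(h))
            return torch.stack(ys, dim=1)

The recurrent state h carries information across arbitrarily long horizons with constant memory, which is exactly the inductive bias the regular-language argument says attention alone does not provide.
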
arXiv:2405.16173 [pdf, other]  cs.LG
Diffusion-based Reinforcement Learning via Q-weighted Variational Policy Optimization
Authors: Shutong Ding, Ke Hu, Zhenhao Zhang, Kan Ren, Weinan Zhang, Jingyi Yu, Jingya Wang, Ye Shi
Abstract: Diffusion models have garnered widespread attention in reinforcement learning (RL) for their powerful expressiveness and multimodality. It has been verified that utilizing diffusion policies can significantly improve the performance of RL algorithms in continuous control tasks by overcoming the limitations of unimodal policies, such as Gaussian policies, and providing the agent with enhanced exploration capabilities. However, existing works mainly focus on the application of diffusion policies in offline RL, while their incorporation into online RL has been less investigated. The training objective of the diffusion model, known as the variational lower bound, cannot be optimized directly in online RL due to the unavailability of 'good' actions, which makes diffusion policy improvement difficult. To overcome this, we propose a novel model-free diffusion-based online RL algorithm, Q-weighted Variational Policy Optimization (QVPO). Specifically, we introduce the Q-weighted variational loss, which can be proved to be a tight lower bound of the policy objective in online RL under certain conditions. To fulfill these conditions, Q-weight transformation functions are introduced for general scenarios. Additionally, to further enhance the exploration capability of the diffusion policy, we design a special entropy regularization term. We also develop an efficient behavior policy that enhances sample efficiency by reducing the variance of the diffusion policy during online interactions. Consequently, the QVPO algorithm leverages the exploration capabilities and multimodality of diffusion policies, preventing the RL agent from converging to a sub-optimal policy. To verify the effectiveness of QVPO, we conduct comprehensive experiments on MuJoCo benchmarks. The final results demonstrate that QVPO achieves state-of-the-art performance in both cumulative reward and sample efficiency.
Submitted 31 October, 2024; v1 submitted 25 May, 2024; originally announced May 2024.
Comments: Accepted by NeurIPS 2024

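The Q-weighted variational loss can be sketched as the usual diffusion denoising loss on sampled actions, reweighted by a transformation of their Q-values. The softmax weighting below is one plausible Q-weight transformation and is an assumption, not necessarily the paper's choice.

    import torch

    def q_weighted_loss(denoise_err, q_values, temperature=1.0):
        """denoise_err: per-action diffusion reconstruction errors, (N,);
        q_values: critic estimates for the same sampled actions, (N,).
        High-Q actions get larger weight, so the diffusion policy is pulled
        toward valuable actions even without a dataset of 'good' actions."""
        w = torch.softmax(q_values / temperature, dim=0)
        return (w * denoise_err).sum()
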
arXiv:2405.15287 [pdf, other]  cs.CV
ArtWeaver: Advanced Dynamic Style Integration via Diffusion Model
Authors: Chengming Xu, Kai Hu, Qilin Wang, Donghao Luo, Jiangning Zhang, Xiaobin Hu, Yanwei Fu, Chengjie Wang
Abstract: Stylized Text-to-Image Generation (STIG) aims to generate images from text prompts and style reference images. In this paper, we present ArtWeaver, a novel framework that leverages pretrained Stable Diffusion (SD) to address challenges such as misinterpreted styles and inconsistent semantics. Our approach introduces two innovative modules: the mixed style descriptor and the dynamic attention adapter. The mixed style descriptor enhances SD by combining content-aware and frequency-disentangled embeddings from CLIP with additional sources that capture global statistics and textual information, thus providing a richer blend of style-related and semantic-related knowledge. To achieve a better balance between adapter capacity and semantic control, the dynamic attention adapter is integrated into the diffusion UNet, dynamically calculating adaptation weights based on the style descriptors. Additionally, we introduce two objective functions to optimize the model alongside the denoising loss, further enhancing semantic and style consistency. Extensive experiments demonstrate the superiority of ArtWeaver over existing methods, producing images with diverse target styles while maintaining the semantic integrity of the text prompts.
Submitted 18 November, 2024; v1 submitted 24 May, 2024; originally announced May 2024.

arXiv:2405.11757 [pdf, other]  cs.CV
DLAFormer: An End-to-End Transformer For Document Layout Analysis
Authors: Jiawei Wang, Kai Hu, Qiang Huo
Abstract: Document layout analysis (DLA) is crucial for understanding the physical layout and logical structure of documents, serving information retrieval, document summarization, knowledge extraction, etc. However, previous studies have typically used separate models to address individual sub-tasks within DLA, including table/figure detection, text region detection, logical role classification, and reading order prediction. In this work, we propose an end-to-end transformer-based approach for document layout analysis, called DLAFormer, which integrates all of these sub-tasks into a single model. To achieve this, we treat the various DLA sub-tasks (such as text region detection, logical role classification, and reading order prediction) as relation prediction problems and consolidate the relation prediction labels into a unified label space, allowing a unified relation prediction module to handle multiple tasks concurrently. Additionally, we introduce a novel set of type-wise queries to enhance the physical meaning of the content queries in DETR. Moreover, we adopt a coarse-to-fine strategy to accurately identify graphical page objects. Experimental results demonstrate that DLAFormer outperforms previous approaches that employ multi-branch or multi-stage architectures for multiple tasks on two document layout analysis benchmarks, DocLayNet and Comp-HRDoc.
Submitted 19 May, 2024; originally announced May 2024.
Comments: ICDAR 2024

arXiv:2405.09113 [pdf, ps, other]  cs.LG
Efficient LLM Jailbreak via Adaptive Dense-to-sparse Constrained Optimization
Authors: Kai Hu, Weichen Yu, Tianjun Yao, Xiang Li, Wenhe Liu, Lijun Yu, Yining Li, Kai Chen, Zhiqiang Shen, Matt Fredrikson
Abstract: Recent research indicates that large language models (LLMs) are susceptible to jailbreaking attacks that can generate harmful content. This paper introduces a novel token-level attack method, Adaptive Dense-to-Sparse Constrained Optimization (ADC), which effectively jailbreaks several open-source LLMs. Our approach relaxes the discrete jailbreak optimization into a continuous optimization and progressively increases the sparsity of the optimized vectors. Consequently, our method effectively bridges the gap between discrete- and continuous-space optimization. Experimental results demonstrate that our method is more effective and efficient than existing token-level methods. On HarmBench, our method achieves a state-of-the-art attack success rate on seven out of eight LLMs. Code will be made available. Trigger Warning: This paper contains model behavior that can be offensive in nature.
Submitted 15 May, 2024; originally announced May 2024.

arXiv:2405.07444 [pdf, other]  cs.CV
Motion Keyframe Interpolation for Any Human Skeleton via Temporally Consistent Point Cloud Sampling and Reconstruction
Authors: Clinton Mo, Kun Hu, Chengjiang Long, Dong Yuan, Zhiyong Wang
Abstract: In the character animation field, modern supervised keyframe interpolation models have demonstrated exceptional performance in constructing natural human motions from sparse pose definitions. As supervised models, large motion datasets are necessary to facilitate the learning process; however, since motion is represented with fixed hierarchical skeletons, such datasets are incompatible with skeletons outside their native configurations. Consequently, the expected availability of a motion dataset for a desired skeleton severely hinders the feasibility of learned interpolation in practice. To combat this limitation, we propose Point Cloud-based Motion Representation Learning (PC-MRL), an unsupervised approach to enabling cross-compatibility between skeletons for motion interpolation learning. PC-MRL consists of a skeleton obfuscation strategy using temporal point cloud sampling and an unsupervised skeleton reconstruction method from point clouds. We devise a temporal point-wise K-nearest neighbors loss for unsupervised learning. Moreover, we propose First-frame Offset Quaternion (FOQ) and Rest Pose Augmentation (RPA) strategies to overcome necessary limitations of our unsupervised point cloud-to-skeletal motion process. Comprehensive experiments demonstrate the effectiveness of PC-MRL in motion interpolation for desired skeletons without supervision from native datasets.
Submitted 12 May, 2024; originally announced May 2024.
Comments: 17 pages, 7 figures

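A point-wise K-nearest-neighbors loss over temporal point clouds can be sketched as follows: for each predicted point, penalize its distance to its K nearest target points in the same frame. This generic form is an assumption for illustration; the paper's exact formulation may differ.

    import torch

    def knn_loss(pred, target, k=4):
        """pred, target: (time, points, 3) point-cloud motion sequences.
        For every predicted point, average the distance to its k nearest
        target points within the same frame, then average over frames."""
        d = torch.cdist(pred, target)                  # (time, P_pred, P_tgt)
        knn = d.topk(k, dim=-1, largest=False).values  # k smallest distances
        return knn.mean()
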
arXiv:2404.16821 [pdf, other]  cs.CV
How Far Are We to GPT-4V? Closing the Gap to Commercial Multimodal Models with Open-Source Suites
Authors: Zhe Chen, Weiyun Wang, Hao Tian, Shenglong Ye, Zhangwei Gao, Erfei Cui, Wenwen Tong, Kongzhi Hu, Jiapeng Luo, Zheng Ma, Ji Ma, Jiaqi Wang, Xiaoyi Dong, Hang Yan, Hewei Guo, Conghui He, Botian Shi, Zhenjiang Jin, Chao Xu, Bin Wang, Xingjian Wei, Wei Li, Wenjian Zhang, Bo Zhang, Pinlong Cai, et al. (10 additional authors not shown)
Abstract: In this report, we introduce InternVL 1.5, an open-source multimodal large language model (MLLM) that bridges the capability gap between open-source and proprietary commercial models in multimodal understanding. We introduce three simple improvements: (1) Strong Vision Encoder: we explored a continuous learning strategy for the large-scale vision foundation model InternViT-6B, boosting its visual understanding capabilities and enabling it to be transferred and reused across different LLMs. (2) Dynamic High-Resolution: we divide images into tiles ranging from 1 to 40 of 448x448 pixels according to the aspect ratio and resolution of the input images, supporting inputs up to 4K resolution. (3) High-Quality Bilingual Dataset: we carefully collected a high-quality bilingual dataset that covers common scenes and document images, and annotated them with English and Chinese question-answer pairs, significantly enhancing performance on OCR- and Chinese-related tasks. We evaluate InternVL 1.5 through a series of benchmarks and comparative studies. Compared to both open-source and proprietary models, InternVL 1.5 shows competitive performance, achieving state-of-the-art results in 8 of 18 benchmarks. Code has been released at https://github.com/OpenGVLab/InternVL.
Submitted 29 April, 2024; v1 submitted 25 April, 2024; originally announced April 2024.
Comments: Technical report

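The dynamic high-resolution scheme picks a tile grid by matching the image's aspect ratio under a tile budget. The scoring rule below is a plausible reconstruction of that selection, not the released implementation.

    def pick_tile_grid(width, height, tile=448, max_tiles=40):
        """Pick a (cols, rows) tiling with cols*rows <= max_tiles whose
        aspect ratio is closest to the image's (illustrative scoring)."""
        target = width / height
        candidates = [(c, r) for c in range(1, max_tiles + 1)
                      for r in range(1, max_tiles // c + 1)]
        return min(candidates, key=lambda cr: abs(cr[0] / cr[1] - target))

    # Example: a 1344x896 image (aspect 1.5) yields (3, 2), i.e. the image
    # is resized to 1344x896 and cut into six 448x448 tiles.
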
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by NeurIPS2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.15287">arXiv:2405.15287</a> <span> [<a href="https://arxiv.org/pdf/2405.15287">pdf</a>, <a href="https://arxiv.org/format/2405.15287">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ArtWeaver: Advanced Dynamic Style Integration via Diffusion Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+C">Chengming Xu</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+K">Kai Hu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qilin Wang</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+D">Donghao Luo</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiangning Zhang</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+X">Xiaobin Hu</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+Y">Yanwei Fu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Chengjie Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.15287v2-abstract-short" style="display: inline;"> Stylized Text-to-Image Generation (STIG) aims to generate images from text prompts and style reference images. In this paper, we present ArtWeaver, a novel framework that leverages pretrained Stable Diffusion (SD) to address challenges such as misinterpreted styles and inconsistent semantics. Our approach introduces two innovative modules: the mixed style descriptor and the dynamic attention adapt… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.15287v2-abstract-full').style.display = 'inline'; document.getElementById('2405.15287v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.15287v2-abstract-full" style="display: none;"> Stylized Text-to-Image Generation (STIG) aims to generate images from text prompts and style reference images. In this paper, we present ArtWeaver, a novel framework that leverages pretrained Stable Diffusion (SD) to address challenges such as misinterpreted styles and inconsistent semantics. Our approach introduces two innovative modules: the mixed style descriptor and the dynamic attention adapter. The mixed style descriptor enhances SD by combining content-aware and frequency-disentangled embeddings from CLIP with additional sources that capture global statistics and textual information, thus providing a richer blend of style-related and semantic-related knowledge. To achieve a better balance between adapter capacity and semantic control, the dynamic attention adapter is integrated into the diffusion UNet, dynamically calculating adaptation weights based on the style descriptors. Additionally, we introduce two objective functions to optimize the model alongside the denoising loss, further enhancing semantic and style consistency. 
Extensive experiments demonstrate the superiority of ArtWeaver over existing methods, producing images with diverse target styles while maintaining the semantic integrity of the text prompts. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.15287v2-abstract-full').style.display = 'none'; document.getElementById('2405.15287v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.11757">arXiv:2405.11757</a> <span> [<a href="https://arxiv.org/pdf/2405.11757">pdf</a>, <a href="https://arxiv.org/format/2405.11757">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> DLAFormer: An End-to-End Transformer For Document Layout Analysis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jiawei Wang</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+K">Kai Hu</a>, <a href="/search/cs?searchtype=author&query=Huo%2C+Q">Qiang Huo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.11757v1-abstract-short" style="display: inline;"> Document layout analysis (DLA) is crucial for understanding the physical layout and logical structure of documents, serving information retrieval, document summarization, knowledge extraction, etc. However, previous studies have typically used separate models to address individual sub-tasks within DLA, including table/figure detection, text region detection, logical role classification, and readin… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.11757v1-abstract-full').style.display = 'inline'; document.getElementById('2405.11757v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.11757v1-abstract-full" style="display: none;"> Document layout analysis (DLA) is crucial for understanding the physical layout and logical structure of documents, serving information retrieval, document summarization, knowledge extraction, etc. However, previous studies have typically used separate models to address individual sub-tasks within DLA, including table/figure detection, text region detection, logical role classification, and reading order prediction. In this work, we propose an end-to-end transformer-based approach for document layout analysis, called DLAFormer, which integrates all these sub-tasks into a single model. To achieve this, we treat various DLA sub-tasks (such as text region detection, logical role classification, and reading order prediction) as relation prediction problems and consolidate these relation prediction labels into a unified label space, allowing a unified relation prediction module to handle multiple tasks concurrently. 
Additionally, we introduce a novel set of type-wise queries to enhance the physical meaning of content queries in DETR. Moreover, we adopt a coarse-to-fine strategy to accurately identify graphical page objects. Experimental results demonstrate that our proposed DLAFormer outperforms previous approaches that employ multi-branch or multi-stage architectures for multiple tasks on two document layout analysis benchmarks, DocLayNet and Comp-HRDoc. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.11757v1-abstract-full').style.display = 'none'; document.getElementById('2405.11757v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICDAR 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.09113">arXiv:2405.09113</a> <span> [<a href="https://arxiv.org/pdf/2405.09113">pdf</a>, <a href="https://arxiv.org/ps/2405.09113">ps</a>, <a href="https://arxiv.org/format/2405.09113">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Efficient LLM Jailbreak via Adaptive Dense-to-sparse Constrained Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hu%2C+K">Kai Hu</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+W">Weichen Yu</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+T">Tianjun Yao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiang Li</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+W">Wenhe Liu</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+L">Lijun Yu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yining Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+K">Kai Chen</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+Z">Zhiqiang Shen</a>, <a href="/search/cs?searchtype=author&query=Fredrikson%2C+M">Matt Fredrikson</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.09113v1-abstract-short" style="display: inline;"> Recent research indicates that large language models (LLMs) are susceptible to jailbreaking attacks that can generate harmful content. This paper introduces a novel token-level attack method, Adaptive Dense-to-Sparse Constrained Optimization (ADC), which effectively jailbreaks several open-source LLMs. Our approach relaxes the discrete jailbreak optimization into a continuous optimization and prog… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.09113v1-abstract-full').style.display = 'inline'; document.getElementById('2405.09113v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.09113v1-abstract-full" style="display: none;"> Recent research indicates that large language models (LLMs) are susceptible to jailbreaking attacks that can generate harmful content. 
This paper introduces a novel token-level attack method, Adaptive Dense-to-Sparse Constrained Optimization (ADC), which effectively jailbreaks several open-source LLMs. Our approach relaxes the discrete jailbreak optimization into a continuous optimization and progressively increases the sparsity of the optimizing vectors. Consequently, our method effectively bridges the gap between discrete and continuous space optimization. Experimental results demonstrate that our method is more effective and efficient than existing token-level methods. On Harmbench, our method achieves state of the art attack success rate on seven out of eight LLMs. Code will be made available. Trigger Warning: This paper contains model behavior that can be offensive in nature. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.09113v1-abstract-full').style.display = 'none'; document.getElementById('2405.09113v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.07444">arXiv:2405.07444</a> <span> [<a href="https://arxiv.org/pdf/2405.07444">pdf</a>, <a href="https://arxiv.org/format/2405.07444">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Motion Keyframe Interpolation for Any Human Skeleton via Temporally Consistent Point Cloud Sampling and Reconstruction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mo%2C+C">Clinton Mo</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+K">Kun Hu</a>, <a href="/search/cs?searchtype=author&query=Long%2C+C">Chengjiang Long</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+D">Dong Yuan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhiyong Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.07444v1-abstract-short" style="display: inline;"> In the character animation field, modern supervised keyframe interpolation models have demonstrated exceptional performance in constructing natural human motions from sparse pose definitions. As supervised models, large motion datasets are necessary to facilitate the learning process; however, since motion is represented with fixed hierarchical skeletons, such datasets are incompatible for skeleto… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.07444v1-abstract-full').style.display = 'inline'; document.getElementById('2405.07444v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.07444v1-abstract-full" style="display: none;"> In the character animation field, modern supervised keyframe interpolation models have demonstrated exceptional performance in constructing natural human motions from sparse pose definitions. 
As supervised models, large motion datasets are necessary to facilitate the learning process; however, since motion is represented with fixed hierarchical skeletons, such datasets are incompatible for skeletons outside the datasets' native configurations. Consequently, the expected availability of a motion dataset for desired skeletons severely hinders the feasibility of learned interpolation in practice. To combat this limitation, we propose Point Cloud-based Motion Representation Learning (PC-MRL), an unsupervised approach to enabling cross-compatibility between skeletons for motion interpolation learning. PC-MRL consists of a skeleton obfuscation strategy using temporal point cloud sampling, and an unsupervised skeleton reconstruction method from point clouds. We devise a temporal point-wise K-nearest neighbors loss for unsupervised learning. Moreover, we propose First-frame Offset Quaternion (FOQ) and Rest Pose Augmentation (RPA) strategies to overcome necessary limitations of our unsupervised point cloud-to-skeletal motion process. Comprehensive experiments demonstrate the effectiveness of PC-MRL in motion interpolation for desired skeletons without supervision from native datasets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.07444v1-abstract-full').style.display = 'none'; document.getElementById('2405.07444v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">17 pages, 7 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.16821">arXiv:2404.16821</a> <span> [<a href="https://arxiv.org/pdf/2404.16821">pdf</a>, <a href="https://arxiv.org/format/2404.16821">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> How Far Are We to GPT-4V? 
Closing the Gap to Commercial Multimodal Models with Open-Source Suites </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhe Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Weiyun Wang</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+H">Hao Tian</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+S">Shenglong Ye</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Z">Zhangwei Gao</a>, <a href="/search/cs?searchtype=author&query=Cui%2C+E">Erfei Cui</a>, <a href="/search/cs?searchtype=author&query=Tong%2C+W">Wenwen Tong</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+K">Kongzhi Hu</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+J">Jiapeng Luo</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+Z">Zheng Ma</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+J">Ji Ma</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jiaqi Wang</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+X">Xiaoyi Dong</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+H">Hang Yan</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+H">Hewei Guo</a>, <a href="/search/cs?searchtype=author&query=He%2C+C">Conghui He</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+B">Botian Shi</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+Z">Zhenjiang Jin</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+C">Chao Xu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Bin Wang</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+X">Xingjian Wei</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wei Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wenjian Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+B">Bo Zhang</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+P">Pinlong Cai</a> , et al. (10 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.16821v2-abstract-short" style="display: inline;"> In this report, we introduce InternVL 1.5, an open-source multimodal large language model (MLLM) to bridge the capability gap between open-source and proprietary commercial models in multimodal understanding. We introduce three simple improvements: (1) Strong Vision Encoder: we explored a continuous learning strategy for the large-scale vision foundation model -- InternViT-6B, boosting its visual… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.16821v2-abstract-full').style.display = 'inline'; document.getElementById('2404.16821v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.16821v2-abstract-full" style="display: none;"> In this report, we introduce InternVL 1.5, an open-source multimodal large language model (MLLM) to bridge the capability gap between open-source and proprietary commercial models in multimodal understanding. We introduce three simple improvements: (1) Strong Vision Encoder: we explored a continuous learning strategy for the large-scale vision foundation model -- InternViT-6B, boosting its visual understanding capabilities, and making it can be transferred and reused in different LLMs. 
(2) Dynamic High-Resolution: we divide images into tiles ranging from 1 to 40 of 448×448 pixels according to the aspect ratio and resolution of the input images, which supports input up to 4K resolution. (3) High-Quality Bilingual Dataset: we carefully collected a high-quality bilingual dataset covering common scenes and document images, annotated with English and Chinese question-answer pairs, significantly enhancing performance in OCR- and Chinese-related tasks. We evaluate InternVL 1.5 through a series of benchmarks and comparative studies. Compared to both open-source and proprietary models, InternVL 1.5 shows competitive performance, achieving state-of-the-art results on 8 of 18 benchmarks. Code has been released at https://github.com/OpenGVLab/InternVL.
Submitted 29 April, 2024; v1 submitted 25 April, 2024; originally announced April 2024.
Comments: Technical report
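The dynamic high-resolution scheme can be illustrated with a small sketch: pick a tile grid (rows × cols, at most 40 tiles) whose aspect ratio best matches the input image, then split the resized image into 448×448 tiles. The selection rule and function names here are assumptions for illustration, not the InternVL 1.5 implementation.

    # Illustrative dynamic-resolution tiling, assuming a closest-aspect-ratio rule.
    from PIL import Image

    def pick_grid(w, h, max_tiles=40):
        target = w / h
        grids = [(c, r) for r in range(1, max_tiles + 1)
                        for c in range(1, max_tiles + 1) if r * c <= max_tiles]
        # choose the grid whose cols/rows ratio is closest to the image's w/h
        return min(grids, key=lambda g: abs(g[0] / g[1] - target))

    def tile_image(img, tile=448, max_tiles=40):
        cols, rows = pick_grid(*img.size, max_tiles)
        img = img.resize((cols * tile, rows * tile))
        return [img.crop((c * tile, r * tile, (c + 1) * tile, (r + 1) * tile))
                for r in range(rows) for c in range(cols)]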
arXiv:2404.06756 [pdf, other] cs.LG cs.AI
CrimeAlarm: Towards Intensive Intent Dynamics in Fine-grained Crime Prediction
Authors: Kaixi Hu, Lin Li, Qing Xie, Xiaohui Tao, Guandong Xu
Abstract: Granularity and accuracy are two crucial factors for crime event prediction. Within fine-grained event classification, multiple criminal intents may appear alternately in preceding sequential events and progress differently in subsequent ones. Such intensive intent dynamics make it hard for trained models to capture unobserved intents, leading to sub-optimal generalization performance, especially when numerous potential events intertwine. To capture comprehensive criminal intents, this paper proposes CrimeAlarm, a fine-grained sequential crime prediction framework equipped with a novel mutual distillation strategy inspired by curriculum learning. During the early training phase, spot-shared criminal intents are captured through high-confidence sequence samples. In the later phase, spot-specific intents are gradually learned by increasing the contribution of low-confidence sequences. Meanwhile, the output probability distributions are reciprocally learned between prediction networks to model unobserved criminal intents. Extensive experiments show that CrimeAlarm outperforms state-of-the-art methods in terms of NDCG@5, with accuracy improvements of 4.51% on NYC16 and 7.73% on CHI18.
Submitted 10 April, 2024; originally announced April 2024.
Comments: Accepted by DASFAA 2024
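Mutual distillation with a curriculum schedule might look roughly like the sketch below: two prediction networks learn from each other's output distributions, with a weight that shifts emphasis from high- to low-confidence samples over training. The weighting scheme shown is an illustrative assumption, not the paper's exact loss.

    # Illustrative mutual-distillation term with a confidence-based curriculum.
    import torch
    import torch.nn.functional as F

    def mutual_distillation_loss(logits_a, logits_b, epoch, total_epochs):
        log_p_a = F.log_softmax(logits_a, dim=-1)
        log_p_b = F.log_softmax(logits_b, dim=-1)
        conf = log_p_b.exp().max(dim=-1).values        # peer confidence per sample
        ramp = epoch / total_epochs                    # curriculum progress in [0, 1]
        w = (1 - ramp) * conf + ramp * (1 - conf)      # early: favor confident samples
        kl = F.kl_div(log_p_a, log_p_b.exp(), reduction='none').sum(-1)
        return (w.detach() * kl).mean()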
arXiv:2404.00364 [pdf, other] cs.RO cs.AI
Accurate Cutting-point Estimation for Robotic Lychee Harvesting through Geometry-aware Learning
Authors: Gengming Zhang, Hao Cao, Kewei Hu, Yaoqiang Pan, Yuqin Deng, Hongjun Wang, Hanwen Kang
Abstract: Accurately identifying lychee-picking points in unstructured orchard environments and obtaining their coordinate locations is critical to the success of lychee-picking robots. However, traditional two-dimensional (2D) image-based object detection methods often struggle due to the complex geometric structures of branches, leaves, and fruits, leading to incorrect determination of lychee picking points. In this study, we propose a Fcaf3d-lychee network model specifically designed for the accurate localisation of lychee picking points. Point cloud data of lychee picking points in natural environments are acquired using Microsoft's Azure Kinect DK time-of-flight (TOF) camera through multi-view stitching. We augment the Fully Convolutional Anchor-Free 3D Object Detection (Fcaf3d) model with a squeeze-and-excitation (SE) module, which exploits human visual attention mechanisms for improved feature extraction of lychee picking points. The trained network model is evaluated on a test set of lychee-picking locations and achieves an impressive F1 score of 88.57%, significantly outperforming existing models.
Subsequent three-dimensional (3D) position detection of picking points in real lychee orchard environments yields high accuracy, even under varying degrees of occlusion. Localisation errors of lychee picking points are within 1.5 cm in all directions, demonstrating the robustness and generality of the model.
Submitted 30 March, 2024; originally announced April 2024.
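The SE module mentioned above is the standard squeeze-and-excitation block (Hu et al., 2018); it is shown here in its common 2D form for illustration, whereas Fcaf3d itself operates on 3D features.

    # Standard squeeze-and-excitation (SE) channel-attention block, 2D form.
    import torch.nn as nn

    class SEBlock(nn.Module):
        def __init__(self, channels, reduction=16):
            super().__init__()
            self.pool = nn.AdaptiveAvgPool2d(1)   # squeeze: global spatial context
            self.fc = nn.Sequential(
                nn.Linear(channels, channels // reduction), nn.ReLU(inplace=True),
                nn.Linear(channels // reduction, channels), nn.Sigmoid())

        def forward(self, x):
            b, c, _, _ = x.shape
            w = self.fc(self.pool(x).view(b, c)).view(b, c, 1, 1)  # excitation weights
            return x * w                                           # channel reweighting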
arXiv:2403.16124 [pdf, other] cs.CV
Enhancing Visual Continual Learning with Language-Guided Supervision
Authors: Bolin Ni, Hongbo Zhao, Chenghao Zhang, Ke Hu, Gaofeng Meng, Zhaoxiang Zhang, Shiming Xiang
Abstract: Continual learning (CL) aims to empower models to learn new tasks without forgetting previously acquired knowledge. Most prior works concentrate on techniques for architectures, replay data, regularization, etc. However, the category name of each class is largely neglected. Existing methods commonly utilize one-hot labels and randomly initialize the classifier head. We argue that the scarce semantic information conveyed by one-hot labels hampers effective knowledge transfer across tasks. In this paper, we revisit the role of the classifier head within the CL paradigm and replace the classifier with semantic knowledge from pretrained language models (PLMs). Specifically, we use PLMs to generate semantic targets for each class, which are frozen and serve as supervision signals during training. Such targets fully consider the semantic correlation between all classes across tasks. Empirical studies show that our approach mitigates forgetting by alleviating representation drift and facilitating knowledge transfer across tasks. The proposed method is simple to implement and can seamlessly be plugged into existing methods with negligible adjustments. Extensive experiments based on eleven mainstream baselines demonstrate the effectiveness and generalizability of our approach across various protocols. For example, under the class-incremental learning setting on ImageNet-100, our method significantly improves Top-1 accuracy by 3.2% to 6.1% while reducing the forgetting rate by 2.6% to 13.1%.
Submitted 24 March, 2024; originally announced March 2024.
Comments: Accepted by CVPR 2024
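The idea of frozen, PLM-generated class targets can be sketched as follows. The choice of sentence encoder and the cosine-similarity head are assumptions for illustration, not necessarily the paper's setup.

    # Sketch: frozen semantic targets from a pretrained language model as the classifier.
    import torch
    from sentence_transformers import SentenceTransformer

    plm = SentenceTransformer('all-MiniLM-L6-v2')          # assumed encoder choice
    class_names = ['goldfish', 'tabby cat', 'school bus']  # example classes
    targets = torch.tensor(plm.encode(class_names))        # frozen semantic targets

    def logits(image_features):
        # cosine similarity between visual features and the frozen class targets
        f = torch.nn.functional.normalize(image_features, dim=-1)
        t = torch.nn.functional.normalize(targets, dim=-1)
        return f @ t.T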
arXiv:2403.15981 [pdf, other] cs.CV
Exploring Accurate 3D Phenotyping in Greenhouse through Neural Radiance Fields
Authors: Junhong Zhao, Wei Ying, Yaoqiang Pan, Zhenfeng Yi, Chao Chen, Kewei Hu, Hanwen Kang
Abstract: Accurate collection of plant phenotyping data is critical to optimising sustainable farming practices in precision agriculture. Traditional phenotyping in controlled laboratory environments, while valuable, falls short in understanding plant growth under real-world conditions. Emerging sensor and digital technologies offer a promising approach for direct phenotyping of plants in farm environments. This study investigates a learning-based phenotyping method using Neural Radiance Fields (NeRF) to achieve accurate in-situ phenotyping of pepper plants in greenhouse environments. To quantitatively evaluate the performance of this method, traditional point cloud registration on 3D scanning data is implemented for comparison. Experimental results show that NeRF achieves competitive accuracy compared to 3D scanning methods: the mean distance error between the scanner-based method and the NeRF-based method is 0.865 mm. This study shows that the learning-based NeRF method achieves accuracy similar to that of 3D scanning-based methods but with improved scalability and robustness.
Submitted 28 March, 2024; v1 submitted 23 March, 2024; originally announced March 2024.
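A mean distance error between two reconstructions is commonly computed as a symmetric nearest-neighbor (chamfer-style) average; the sketch below assumes such a protocol behind the 0.865 mm figure, since the abstract does not state it.

    # Symmetric nearest-neighbor mean distance between two point clouds.
    import numpy as np
    from scipy.spatial import cKDTree

    def mean_distance_error(cloud_a, cloud_b):
        # cloud_a: (N, 3), cloud_b: (M, 3) arrays of xyz points
        d_ab, _ = cKDTree(cloud_b).query(cloud_a)  # nearest point in b for each point in a
        d_ba, _ = cKDTree(cloud_a).query(cloud_b)
        return (d_ab.mean() + d_ba.mean()) / 2     # symmetric average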
arXiv:2402.09685 [pdf, other] cs.RO
Pheno-Robot: An Auto-Digital Modelling System for In-Situ Phenotyping in the Field
Authors: Yaoqiang Pan, Kewei Hu, Tianhao Liu, Chao Chen, Hanwen Kang
Abstract: Accurate reconstruction of plant models for phenotyping analysis is critical for optimising sustainable agricultural practices in precision agriculture. Traditional laboratory-based phenotyping, while valuable, falls short of understanding how plants grow under uncontrolled conditions. Robotic technologies offer a promising avenue for large-scale, direct phenotyping in real-world environments. This study explores the deployment of emerging robotics and digital technology in plant phenotyping to improve performance and efficiency. Three critical functional modules, environmental understanding, robotic motion planning, and in-situ phenotyping, are introduced to automate the entire process. Experimental results demonstrate the effectiveness of the system in agricultural environments. The pheno-robot system autonomously collects high-quality data by navigating around plants. In addition, the in-situ modelling module reconstructs high-quality plant models from the data collected by the robot. The developed robotic system shows high efficiency and robustness, demonstrating its potential to advance plant science in real-world agricultural environments.
Submitted 14 February, 2024; originally announced February 2024.
arXiv:2402.03093 [pdf, other] cs.CV cs.HC
AI-Enhanced Virtual Reality in Medicine: A Comprehensive Survey
Authors: Yixuan Wu, Kaiyuan Hu, Danny Z. Chen, Jian Wu
Abstract: With the rapid advance of computer graphics and artificial intelligence technologies, the ways we interact with the world have undergone a transformative shift. Virtual Reality (VR) technology, aided by artificial intelligence (AI), has emerged as a dominant interaction medium in multiple application areas, thanks to its advantage of providing users with immersive experiences. Among those applications, medicine is considered one of the most promising areas. In this paper, we present a comprehensive examination of the burgeoning field of AI-enhanced VR applications in medical care and services. By introducing a systematic taxonomy, we meticulously classify the pertinent techniques and applications into three well-defined categories based on different phases of medical diagnosis and treatment: Visualization Enhancement, VR-related Medical Data Processing, and VR-assisted Intervention. This categorization enables a structured exploration of the diverse roles that AI-powered VR plays in the medical domain, providing a framework for a more comprehensive understanding and evaluation of these technologies. To the best of our knowledge, this is the first systematic survey of AI-powered VR systems in medical settings, laying a foundation for future research in this interdisciplinary domain.
Submitted 11 July, 2024; v1 submitted 5 February, 2024; originally announced February 2024.
arXiv:2401.12789 [pdf, other] cs.CL cs.SD eess.AS
Multilingual and Fully Non-Autoregressive ASR with Large Language Model Fusion: A Comprehensive Study
Authors: W. Ronny Huang, Cyril Allauzen, Tongzhou Chen, Kilol Gupta, Ke Hu, James Qin, Yu Zhang, Yongqiang Wang, Shuo-Yiin Chang, Tara N. Sainath
Abstract: In the era of large models, the autoregressive nature of decoding often results in latency serving as a significant bottleneck. We propose a non-autoregressive LM-fused ASR system that effectively leverages the parallelization capabilities of accelerator hardware. Our approach combines the Universal Speech Model (USM) and the PaLM 2 language model in per-segment scoring mode, achieving an average relative WER improvement across all languages of 10.8% on FLEURS and 3.6% on YouTube captioning. Furthermore, our comprehensive ablation study analyzes key parameters such as LLM size, context length, vocabulary size, and fusion methodology. For instance, we explore the impact of LLM size, ranging from 128M to 340B parameters, on ASR performance. This study provides valuable insights into the factors influencing the effectiveness of practical large-scale LM-fused speech recognition systems.
Submitted 23 January, 2024; originally announced January 2024.
Comments: ICASSP 2024
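Per-segment scoring can be caricatured as n-best rescoring over independently decoded segments: the ASR model emits hypotheses in parallel, an external LM scores each, and the two scores are interpolated. The function below is a stand-in sketch, not the USM or PaLM 2 API.

    # Sketch of per-segment LM fusion via n-best rescoring (names are placeholders).
    def rescore_segment(hypotheses, lm_score, asr_weight=0.7):
        # hypotheses: list of (text, asr_log_prob) pairs for one audio segment
        # lm_score: callable returning a log-probability for a text string
        def fused(h):
            text, asr_lp = h
            return asr_weight * asr_lp + (1 - asr_weight) * lm_score(text)
        return max(hypotheses, key=fused)[0]   # best hypothesis under the fused score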
arXiv:2401.12433 [pdf, other] cs.CV
A Novel Garment Transfer Method Supervised by Distilled Knowledge of Virtual Try-on Model
Authors: Naiyu Fang, Lemiao Qiu, Shuyou Zhang, Zili Wang, Kerui Hu, Jianrong Tan
Abstract: This paper proposes a novel garment transfer method supervised with knowledge distilled from virtual try-on. Our method first infers the transfer parsing to provide a shape prior for downstream tasks. We employ a multi-phase teaching strategy to supervise the training of the transfer parsing reasoning model, learning the response and feature knowledge from the try-on parsing reasoning model. To correct teaching errors, it transfers the garment back to its owner to absorb the hard knowledge in the self-study phase. Guided by the transfer parsing, we adjust the position of the transferred garment via STN to prevent distortion. Afterward, we estimate a progressive flow to precisely warp the garment with shape and content correspondences. To ensure warping rationality, we supervise the training of the garment warping model using the target shape and warping knowledge from virtual try-on. To better preserve body features in the transfer result, we propose a well-designed training strategy for the arm regrowth task to infer newly exposed skin. Experiments demonstrate that our method achieves state-of-the-art performance compared with other virtual try-on and garment transfer methods, especially for preserving garment texture and body features.
Submitted 4 April, 2024; v1 submitted 22 January, 2024; originally announced January 2024.

arXiv:2401.11874 [pdf, other] cs.CV
Detect-Order-Construct: A Tree Construction based Approach for Hierarchical Document Structure Analysis
Authors: Jiawei Wang, Kai Hu, Zhuoyao Zhong, Lei Sun, Qiang Huo
Abstract: Document structure analysis (aka document layout analysis) is crucial for understanding the physical layout and logical structure of documents, with applications in information retrieval, document summarization, knowledge extraction, etc. In this paper, we concentrate on Hierarchical Document Structure Analysis (HDSA) to explore hierarchical relationships within structured documents created using authoring software that employs hierarchical schemas, such as LaTeX, Microsoft Word, and HTML. To comprehensively analyze hierarchical document structures, we propose a tree construction based approach that addresses multiple subtasks concurrently: page object detection (Detect), reading order prediction of identified objects (Order), and construction of the intended hierarchical structure (Construct). We present an effective end-to-end solution based on this framework to demonstrate its performance.
To assess our approach, we develop a comprehensive benchmark called Comp-HRDoc, which evaluates the above subtasks simultaneously. Our end-to-end system achieves state-of-the-art performance on two large-scale document layout analysis datasets (PubLayNet and DocLayNet), a high-quality hierarchical document structure reconstruction dataset (HRDoc), and our Comp-HRDoc benchmark. The Comp-HRDoc benchmark will be released to facilitate further research in this field.
Submitted 28 March, 2024; v1 submitted 22 January, 2024; originally announced January 2024.
Comments: Submitted to Pattern Recognition
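The Detect-Order-Construct decomposition can be made concrete with a toy assembly step: given detected page objects, a predicted reading order, and a predicted parent per object, build the hierarchy. The predictors themselves are out of scope here; this only illustrates the Construct stage's inputs and output.

    # Toy Construct step: assemble a tree from detect/order/parent predictions.
    def build_tree(objects, order, parent):
        # objects: dict id -> label; order: ids in reading order;
        # parent: dict id -> parent id (None means top level)
        children = {i: [] for i in objects}
        roots = []
        for i in order:                      # inserting in reading order keeps
            p = parent.get(i)                # siblings correctly sequenced
            (children[p] if p is not None else roots).append(i)
        return roots, children

    roots, children = build_tree(
        {1: 'title', 2: 'section', 3: 'paragraph'},
        [1, 2, 3], {1: None, 2: None, 3: 2})
    print(roots, children)   # [1, 2] {1: [], 2: [3], 3: []}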
arXiv:2401.09232 [pdf, other] cs.CV
Dynamic Relation Transformer for Contextual Text Block Detection
Authors: Jiawei Wang, Shunchi Zhang, Kai Hu, Chixiang Ma, Zhuoyao Zhong, Lei Sun, Qiang Huo
Abstract: Contextual Text Block Detection (CTBD) is the task of identifying coherent text blocks within the complexity of natural scenes. Previous methodologies have treated CTBD either as a visual relation extraction challenge within computer vision or as a sequence modeling problem from the perspective of natural language processing. We introduce a new framework that frames CTBD as a graph generation problem. This methodology consists of two essential procedures: identifying individual text units as graph nodes and discerning the sequential reading-order relationships among these units as graph edges. Leveraging the capabilities of DQ-DETR for node detection, our framework innovates further by integrating a novel mechanism, a Dynamic Relation Transformer (DRFormer), dedicated to edge generation. DRFormer incorporates a dual interactive transformer decoder that manages a dynamic graph structure refinement process. Through this iterative process, the model systematically enhances the graph's fidelity, ultimately resulting in improved precision in detecting contextual text blocks. Comprehensive experimental evaluations conducted on both the SCUT-CTW-Context and ReCTS-Context datasets substantiate that our method achieves state-of-the-art results, underscoring the effectiveness and potential of our graph generation framework in advancing the field of CTBD.
Submitted 17 January, 2024; originally announced January 2024.
arXiv:2401.09220 [pdf, other] cs.CL
UniVIE: A Unified Label Space Approach to Visual Information Extraction from Form-like Documents
Authors: Kai Hu, Jiawei Wang, Weihong Lin, Zhuoyao Zhong, Lei Sun, Qiang Huo
Abstract: Existing methods for Visual Information Extraction (VIE) from form-like documents typically fragment the process into separate subtasks, such as key information extraction, key-value pair extraction, and choice group extraction. However, these approaches often overlook the hierarchical structure of form documents, including hierarchical key-value pairs and hierarchical choice groups. To address these limitations, we present a new perspective, reframing VIE as a relation prediction problem and unifying the labels of different tasks into a single label space. This unified approach allows for the definition of various relation types and effectively tackles hierarchical relationships in form-like documents. In line with this perspective, we present UniVIE, a unified model that addresses the VIE problem comprehensively. UniVIE functions using a coarse-to-fine strategy: it initially generates tree proposals through a tree proposal network, which are subsequently refined into hierarchical trees by a relation decoder module. To enhance the relation prediction capabilities of UniVIE, we incorporate two novel tree constraints into the relation decoder: a tree attention mask and a tree level embedding. Extensive experimental evaluations on both our in-house dataset HierForms and the publicly available dataset SIBR substantiate that our method achieves state-of-the-art results, underscoring the effectiveness and potential of our unified approach in advancing the field of VIE.
Submitted 17 January, 2024; originally announced January 2024.
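A "tree attention mask" admits several constructions; one plausible reading, offered purely as an assumption rather than the paper's definition, restricts each node to attend to itself and its ancestors in the proposal tree.

    # Assumed construction of a tree attention mask (self + ancestors allowed).
    import torch

    def tree_attention_mask(parent):
        # parent: list where parent[i] is the parent index of node i (-1 = root)
        n = len(parent)
        mask = torch.eye(n, dtype=torch.bool)
        for i in range(n):
            j = parent[i]
            while j != -1:               # walk up to the root
                mask[i, j] = True
                j = parent[j]
        return mask                      # True = attention allowed

    print(tree_attention_mask([-1, 0, 0, 1]))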
arXiv:2401.07487 [pdf, other] cs.RO cs.CV
Robo-ABC: Affordance Generalization Beyond Categories via Semantic Correspondence for Robot Manipulation
Authors: Yuanchen Ju, Kaizhe Hu, Guowei Zhang, Gu Zhang, Mingrun Jiang, Huazhe Xu
Abstract: Enabling robotic manipulation that generalizes to out-of-distribution scenes is a crucial step toward open-world embodied intelligence. For human beings, this ability is rooted in the understanding of semantic correspondence among objects, which naturally transfers the interaction experience of familiar objects to novel ones. Although robots lack such a reservoir of interaction experience, the vast availability of human videos on the Internet may serve as a valuable resource, from which we extract an affordance memory that includes contact points. Inspired by the natural way humans think, we propose Robo-ABC: when confronted with unfamiliar objects that require generalization, the robot can acquire affordances by retrieving objects that share visual or semantic similarities from the affordance memory. The next step is to map the contact points of the retrieved objects to the new object. While establishing this correspondence may present formidable challenges at first glance, recent research finds that it naturally arises from pre-trained diffusion models, enabling affordance mapping even across disparate object categories. Through the Robo-ABC framework, robots may generalize to manipulate out-of-category objects in a zero-shot manner without any manual annotation, additional training, part segmentation, pre-coded knowledge, or viewpoint restrictions.
Quantitatively, Robo-ABC significantly enhances the accuracy of visual affordance retrieval by a large margin of 31.6% compared to state-of-the-art (SOTA) end-to-end affordance models. We also conduct real-world experiments on cross-category object-grasping tasks; Robo-ABC achieves a success rate of 85.7%, proving its capacity for real-world tasks.
Submitted 15 January, 2024; originally announced January 2024.
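Retrieval from such an affordance memory reduces to nearest-neighbor search in an embedding space; the conceptual sketch below uses stand-in embeddings and contact points, with the embedding model and the subsequent contact-point mapping left out of scope.

    # Conceptual sketch of affordance-memory retrieval by cosine similarity.
    import numpy as np

    memory = [                                 # (embedding, contact_points) pairs
        (np.array([0.9, 0.1]), [(120, 80)]),   # e.g. a mug handle
        (np.array([0.2, 0.8]), [(60, 200)]),   # e.g. a drawer pull
    ]

    def cosine(a, b):
        return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b)))

    def retrieve_contacts(query_emb):
        best = max(memory, key=lambda m: cosine(query_emb, m[0]))
        return best[1]    # contact points to map onto the novel object

    print(retrieve_contacts(np.array([0.8, 0.3])))   # -> [(120, 80)]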