CINXE.COM

Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/> <!-- new favicon config and versions by realfavicongenerator.net --> <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b"> <!-- end favicon config --> <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\\(","\\)"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a> <!-- contains Cornell logo and sponsor statement --> <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 252 results for author: <span class="mathjax">Wan, Z</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Wan%2C+Z">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Wan, Z"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Wan%2C+Z&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Wan, Z"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Wan%2C+Z&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Wan%2C+Z&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Wan%2C+Z&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Wan%2C+Z&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Wan%2C+Z&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Wan%2C+Z&amp;start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Wan%2C+Z&amp;start=250" class="pagination-link " aria-label="Page 6" aria-current="page">6 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.17017">arXiv:2411.17017</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.17017">pdf</a>, <a href="https://arxiv.org/format/2411.17017">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> TED-VITON: Transformer-Empowered Diffusion Models for Virtual Try-On </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wan%2C+Z">Zhenchen Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+Y">Yanwu Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zhaoqing Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+F">Feng Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+T">Tongliang Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Gong%2C+M">Mingming Gong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.17017v1-abstract-short" style="display: inline;"> Recent advancements in Virtual Try-On (VTO) have demonstrated exceptional efficacy in generating realistic images and preserving garment details, largely attributed to the robust generative capabilities of text-to-image (T2I) diffusion backbones. However, the T2I models that underpin these methods have become outdated, thereby limiting the potential for further improvement in VTO. Additionally, cu&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17017v1-abstract-full').style.display = 'inline'; document.getElementById('2411.17017v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.17017v1-abstract-full" style="display: none;"> Recent advancements in Virtual Try-On (VTO) have demonstrated exceptional efficacy in generating realistic images and preserving garment details, largely attributed to the robust generative capabilities of text-to-image (T2I) diffusion backbones. However, the T2I models that underpin these methods have become outdated, thereby limiting the potential for further improvement in VTO. Additionally, current methods face notable challenges in accurately rendering text on garments without distortion and preserving fine-grained details, such as textures and material fidelity. The emergence of Diffusion Transformer (DiT) based T2I models has showcased impressive performance and offers a promising opportunity for advancing VTO. Directly applying existing VTO techniques to transformer-based T2I models is ineffective due to substantial architectural differences, which hinder their ability to fully leverage the models&#39; advanced capabilities for improved text generation. To address these challenges and unlock the full potential of DiT-based T2I models for VTO, we propose TED-VITON, a novel framework that integrates a Garment Semantic (GS) Adapter for enhancing garment-specific features, a Text Preservation Loss to ensure accurate and distortion-free text rendering, and a constraint mechanism to generate prompts by optimizing Large Language Model (LLM). These innovations enable state-of-the-art (SOTA) performance in visual quality and text fidelity, establishing a new benchmark for VTO task. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17017v1-abstract-full').style.display = 'none'; document.getElementById('2411.17017v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 6 figures, 3 tables, conference</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.15595">arXiv:2411.15595</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.15595">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> An adversarial feature learning based semantic communication method for Human 3D Reconstruction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+S">Shaojiang Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zou%2C+J">Jiajun Zou</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Zhendan Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+M">Meixia Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Wan%2C+Z">Zhiping Wan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.15595v1-abstract-short" style="display: inline;"> With the widespread application of human body 3D reconstruction technology across various fields, the demands for data transmission and processing efficiency continue to rise, particularly in scenarios where network bandwidth is limited and low latency is required. This paper introduces an Adversarial Feature Learning-based Semantic Communication method (AFLSC) for human body 3D reconstruction, wh&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15595v1-abstract-full').style.display = 'inline'; document.getElementById('2411.15595v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.15595v1-abstract-full" style="display: none;"> With the widespread application of human body 3D reconstruction technology across various fields, the demands for data transmission and processing efficiency continue to rise, particularly in scenarios where network bandwidth is limited and low latency is required. This paper introduces an Adversarial Feature Learning-based Semantic Communication method (AFLSC) for human body 3D reconstruction, which focuses on extracting and transmitting semantic information crucial for the 3D reconstruction task, thereby significantly optimizing data flow and alleviating bandwidth pressure. At the sender&#39;s end, we propose a multitask learning-based feature extraction method to capture the spatial layout, keypoints, posture, and depth information from 2D human images, and design a semantic encoding technique based on adversarial feature learning to encode these feature information into semantic data. We also develop a dynamic compression technique to efficiently transmit this semantic data, greatly enhancing transmission efficiency and reducing latency. At the receiver&#39;s end, we design an efficient multi-level semantic feature decoding method to convert semantic data back into key image features. Finally, an improved ViT-diffusion model is employed for 3D reconstruction, producing human body 3D mesh models. Experimental results validate the advantages of our method in terms of data transmission efficiency and reconstruction quality, demonstrating its excellent potential for application in bandwidth-limited environments. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15595v1-abstract-full').style.display = 'none'; document.getElementById('2411.15595v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14251">arXiv:2411.14251</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.14251">pdf</a>, <a href="https://arxiv.org/format/2411.14251">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Natural Language Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Feng%2C+X">Xidong Feng</a>, <a href="/search/cs?searchtype=author&amp;query=Wan%2C+Z">Ziyu Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Fu%2C+H">Haotian Fu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+B">Bo Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+M">Mengyue Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Koushik%2C+G+A">Girish A. Koushik</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+Z">Zhiyuan Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Wen%2C+Y">Ying Wen</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Jun Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14251v1-abstract-short" style="display: inline;"> Reinforcement Learning (RL) mathematically formulates decision-making with Markov Decision Process (MDP). With MDPs, researchers have achieved remarkable breakthroughs across various domains, including games, robotics, and language models. This paper seeks a new possibility, Natural Language Reinforcement Learning (NLRL), by extending traditional MDP to natural language-based representation space.&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14251v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14251v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14251v1-abstract-full" style="display: none;"> Reinforcement Learning (RL) mathematically formulates decision-making with Markov Decision Process (MDP). With MDPs, researchers have achieved remarkable breakthroughs across various domains, including games, robotics, and language models. This paper seeks a new possibility, Natural Language Reinforcement Learning (NLRL), by extending traditional MDP to natural language-based representation space. Specifically, NLRL innovatively redefines RL principles, including task objectives, policy, value function, Bellman equation, and policy iteration, into their language counterparts. With recent advancements in large language models (LLMs), NLRL can be practically implemented to achieve RL-like policy and value improvement by either pure prompting or gradient-based training. Experiments over Maze, Breakthrough, and Tic-Tac-Toe games demonstrate the effectiveness, efficiency, and interpretability of the NLRL framework among diverse use cases. Our code will be released at https://github.com/waterhorse1/Natural-language-RL. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14251v1-abstract-full').style.display = 'none'; document.getElementById('2411.14251v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Extension of arXiv:2402.07157</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06965">arXiv:2411.06965</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.06965">pdf</a>, <a href="https://arxiv.org/format/2411.06965">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Imitation from Diverse Behaviors: Wasserstein Quality Diversity Imitation Learning with Single-Step Archive Exploration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yu%2C+X">Xingrui Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Wan%2C+Z">Zhenglin Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Bossens%2C+D+M">David Mark Bossens</a>, <a href="/search/cs?searchtype=author&amp;query=Lyu%2C+Y">Yueming Lyu</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+Q">Qing Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Tsang%2C+I+W">Ivor W. Tsang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06965v1-abstract-short" style="display: inline;"> Learning diverse and high-performance behaviors from a limited set of demonstrations is a grand challenge. Traditional imitation learning methods usually fail in this task because most of them are designed to learn one specific behavior even with multiple demonstrations. Therefore, novel techniques for quality diversity imitation learning are needed to solve the above challenge. This work introduc&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06965v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06965v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06965v1-abstract-full" style="display: none;"> Learning diverse and high-performance behaviors from a limited set of demonstrations is a grand challenge. Traditional imitation learning methods usually fail in this task because most of them are designed to learn one specific behavior even with multiple demonstrations. Therefore, novel techniques for quality diversity imitation learning are needed to solve the above challenge. This work introduces Wasserstein Quality Diversity Imitation Learning (WQDIL), which 1) improves the stability of imitation learning in the quality diversity setting with latent adversarial training based on a Wasserstein Auto-Encoder (WAE), and 2) mitigates a behavior-overfitting issue using a measure-conditioned reward function with a single-step archive exploration bonus. Empirically, our method significantly outperforms state-of-the-art IL methods, achieving near-expert or beyond-expert QD performance on the challenging continuous control tasks derived from MuJoCo environments. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06965v1-abstract-full').style.display = 'none'; document.getElementById('2411.06965v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06469">arXiv:2411.06469</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.06469">pdf</a>, <a href="https://arxiv.org/format/2411.06469">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> ClinicalBench: Can LLMs Beat Traditional ML Models in Clinical Prediction? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+C">Canyu Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+J">Jian Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+S">Shan Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+C">Che Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Wan%2C+Z">Zhongwei Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Bitterman%2C+D">Danielle Bitterman</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+F">Fei Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Shu%2C+K">Kai Shu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06469v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) hold great promise to revolutionize current clinical systems for their superior capacities on medical text processing tasks and medical licensing exams. Meanwhile, traditional ML models such as SVM and XGBoost have still been mainly adopted in clinical prediction tasks. An emerging question is Can LLMs beat traditional ML models in clinical prediction? Thus, we build a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06469v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06469v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06469v1-abstract-full" style="display: none;"> Large Language Models (LLMs) hold great promise to revolutionize current clinical systems for their superior capacities on medical text processing tasks and medical licensing exams. Meanwhile, traditional ML models such as SVM and XGBoost have still been mainly adopted in clinical prediction tasks. An emerging question is Can LLMs beat traditional ML models in clinical prediction? Thus, we build a new benchmark ClinicalBench to comprehensively study the clinical predictive modeling capacities of both general-purpose and medical LLMs, and compare them with traditional ML models. ClinicalBench embraces three common clinical prediction tasks, two databases, 14 general-purpose LLMs, 8 medical LLMs, and 11 traditional ML models. Through extensive empirical investigation, we discover that both general-purpose and medical LLMs, even with different model scales, diverse prompting or fine-tuning strategies, still cannot beat traditional ML models in clinical prediction yet, shedding light on their potential deficiency in clinical reasoning and decision-making. We call for caution when practitioners adopt LLMs in clinical applications. ClinicalBench can be utilized to bridge the gap between LLMs&#39; development for healthcare and real-world clinical practice. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06469v1-abstract-full').style.display = 'none'; document.getElementById('2411.06469v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The first two authors contributed equally. 10 pages for main paper, 66 pages including appendix. Project website: https://clinicalbench.github.io</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.05902">arXiv:2411.05902</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.05902">pdf</a>, <a href="https://arxiv.org/format/2411.05902">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Autoregressive Models in Vision: A Survey </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xiong%2C+J">Jing Xiong</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+G">Gongye Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+L">Lun Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+C">Chengyue Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+T">Taiqiang Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Mu%2C+Y">Yao Mu</a>, <a href="/search/cs?searchtype=author&amp;query=Yao%2C+Y">Yuan Yao</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+H">Hui Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Wan%2C+Z">Zhongwei Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+J">Jinfa Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Tao%2C+C">Chaofan Tao</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+S">Shen Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Yao%2C+H">Huaxiu Yao</a>, <a href="/search/cs?searchtype=author&amp;query=Kong%2C+L">Lingpeng Kong</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+H">Hongxia Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+M">Mi Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Sapiro%2C+G">Guillermo Sapiro</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+J">Jiebo Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+P">Ping Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Wong%2C+N">Ngai Wong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.05902v1-abstract-short" style="display: inline;"> Autoregressive modeling has been a huge success in the field of natural language processing (NLP). Recently, autoregressive models have emerged as a significant area of focus in computer vision, where they excel in producing high-quality visual content. Autoregressive models in NLP typically operate on subword tokens. However, the representation strategy in computer vision can vary in different le&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05902v1-abstract-full').style.display = 'inline'; document.getElementById('2411.05902v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.05902v1-abstract-full" style="display: none;"> Autoregressive modeling has been a huge success in the field of natural language processing (NLP). Recently, autoregressive models have emerged as a significant area of focus in computer vision, where they excel in producing high-quality visual content. Autoregressive models in NLP typically operate on subword tokens. However, the representation strategy in computer vision can vary in different levels, \textit{i.e.}, pixel-level, token-level, or scale-level, reflecting the diverse and hierarchical nature of visual data compared to the sequential structure of language. This survey comprehensively examines the literature on autoregressive models applied to vision. To improve readability for researchers from diverse research backgrounds, we start with preliminary sequence representation and modeling in vision. Next, we divide the fundamental frameworks of visual autoregressive models into three general sub-categories, including pixel-based, token-based, and scale-based models based on the strategy of representation. We then explore the interconnections between autoregressive models and other generative models. Furthermore, we present a multi-faceted categorization of autoregressive models in computer vision, including image generation, video generation, 3D generation, and multi-modal generation. We also elaborate on their applications in diverse domains, including emerging domains such as embodied AI and 3D medical AI, with about 250 related references. Finally, we highlight the current challenges to autoregressive models in vision with suggestions about potential research directions. We have also set up a Github repository to organize the papers included in this survey at: \url{https://github.com/ChaofanTao/Autoregressive-Models-in-Vision-Survey}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05902v1-abstract-full').style.display = 'none'; document.getElementById('2411.05902v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.03628">arXiv:2411.03628</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.03628">pdf</a>, <a href="https://arxiv.org/format/2411.03628">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> StreamingBench: Assessing the Gap for MLLMs to Achieve Streaming Video Understanding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lin%2C+J">Junming Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Fang%2C+Z">Zheng Fang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+C">Chi Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Wan%2C+Z">Zihao Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+F">Fuwen Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+P">Peng Li</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yang Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+M">Maosong Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.03628v1-abstract-short" style="display: inline;"> The rapid development of Multimodal Large Language Models (MLLMs) has expanded their capabilities from image comprehension to video understanding. However, most of these MLLMs focus primarily on offline video comprehension, necessitating extensive processing of all video frames before any queries can be made. This presents a significant gap compared to the human ability to watch, listen, think, an&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03628v1-abstract-full').style.display = 'inline'; document.getElementById('2411.03628v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.03628v1-abstract-full" style="display: none;"> The rapid development of Multimodal Large Language Models (MLLMs) has expanded their capabilities from image comprehension to video understanding. However, most of these MLLMs focus primarily on offline video comprehension, necessitating extensive processing of all video frames before any queries can be made. This presents a significant gap compared to the human ability to watch, listen, think, and respond to streaming inputs in real time, highlighting the limitations of current MLLMs. In this paper, we introduce StreamingBench, the first comprehensive benchmark designed to evaluate the streaming video understanding capabilities of MLLMs. StreamingBench assesses three core aspects of streaming video understanding: (1) real-time visual understanding, (2) omni-source understanding, and (3) contextual understanding. The benchmark consists of 18 tasks, featuring 900 videos and 4,500 human-curated QA pairs. Each video features five questions presented at different time points to simulate a continuous streaming scenario. We conduct experiments on StreamingBench with 13 open-source and proprietary MLLMs and find that even the most advanced proprietary MLLMs like Gemini 1.5 Pro and GPT-4o perform significantly below human-level streaming video understanding capabilities. We hope our work can facilitate further advancements for MLLMs, empowering them to approach human-level video comprehension and interaction in more realistic scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03628v1-abstract-full').style.display = 'none'; document.getElementById('2411.03628v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20124">arXiv:2410.20124</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.20124">pdf</a>, <a href="https://arxiv.org/format/2410.20124">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> Breaking the Midas Spell:Understanding Progressive Novice-AI Collaboration in Spatial Design </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wan%2C+Z">Zijun Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+J">Jiawei Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Cai%2C+L">Linghang Cai</a>, <a href="/search/cs?searchtype=author&amp;query=Tong%2C+X">Xin Tong</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+C">Can Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.20124v1-abstract-short" style="display: inline;"> In spatial design, Artificial Intelligence (AI) tools often generate the entire spatial design outcome in a single automated step, rather than engaging users in a deepening and iterative process. This significantly reduces users&#39; involvement, learning, and creative capabilities, leading to a superficial understanding of spatial design. We conducted a Wizard-of-Oz study, where Novices and AI (acted&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20124v1-abstract-full').style.display = 'inline'; document.getElementById('2410.20124v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20124v1-abstract-full" style="display: none;"> In spatial design, Artificial Intelligence (AI) tools often generate the entire spatial design outcome in a single automated step, rather than engaging users in a deepening and iterative process. This significantly reduces users&#39; involvement, learning, and creative capabilities, leading to a superficial understanding of spatial design. We conducted a Wizard-of-Oz study, where Novices and AI (acted by experimenters) worked together to finish spatial design tasks using various AI models. We identified typical function and workflow patterns adopted by the participants, leading to the understanding of the opportunities and challenges in the human-AI co-creation process. Based on insights gathered from this research, we proposed some design implications of the novice-AI collaboration system that aims to democratize spatial design through a progressive, iterative co-creation process. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20124v1-abstract-full').style.display = 'none'; document.getElementById('2410.20124v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">draft submission to CHI 2025</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> H.5.2 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.19452">arXiv:2410.19452</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.19452">pdf</a>, <a href="https://arxiv.org/format/2410.19452">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> NeuroClips: Towards High-fidelity and Smooth fMRI-to-Video Reconstruction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gong%2C+Z">Zixuan Gong</a>, <a href="/search/cs?searchtype=author&amp;query=Bao%2C+G">Guangyin Bao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Q">Qi Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Wan%2C+Z">Zhongwei Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Miao%2C+D">Duoqian Miao</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+S">Shoujin Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+L">Lei Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+C">Changwei Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+R">Rongtao Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+L">Liang Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+K">Ke Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yu Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.19452v2-abstract-short" style="display: inline;"> Reconstruction of static visual stimuli from non-invasion brain activity fMRI achieves great success, owning to advanced deep learning models such as CLIP and Stable Diffusion. However, the research on fMRI-to-video reconstruction remains limited since decoding the spatiotemporal perception of continuous visual experiences is formidably challenging. We contend that the key to addressing these chal&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19452v2-abstract-full').style.display = 'inline'; document.getElementById('2410.19452v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.19452v2-abstract-full" style="display: none;"> Reconstruction of static visual stimuli from non-invasion brain activity fMRI achieves great success, owning to advanced deep learning models such as CLIP and Stable Diffusion. However, the research on fMRI-to-video reconstruction remains limited since decoding the spatiotemporal perception of continuous visual experiences is formidably challenging. We contend that the key to addressing these challenges lies in accurately decoding both high-level semantics and low-level perception flows, as perceived by the brain in response to video stimuli. To the end, we propose NeuroClips, an innovative framework to decode high-fidelity and smooth video from fMRI. NeuroClips utilizes a semantics reconstructor to reconstruct video keyframes, guiding semantic accuracy and consistency, and employs a perception reconstructor to capture low-level perceptual details, ensuring video smoothness. During inference, it adopts a pre-trained T2V diffusion model injected with both keyframes and low-level perception flows for video reconstruction. Evaluated on a publicly available fMRI-video dataset, NeuroClips achieves smooth high-fidelity video reconstruction of up to 6s at 8FPS, gaining significant improvements over state-of-the-art models in various metrics, e.g., a 128% improvement in SSIM and an 81% improvement in spatiotemporal metrics. Our project is available at https://github.com/gongzix/NeuroClips. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19452v2-abstract-full').style.display = 'none'; document.getElementById('2410.19452v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 25 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2024 Oral</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.15392">arXiv:2410.15392</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.15392">pdf</a>, <a href="https://arxiv.org/format/2410.15392">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> EF-3DGS: Event-Aided Free-Trajectory 3D Gaussian Splatting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liao%2C+B">Bohao Liao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhai%2C+W">Wei Zhai</a>, <a href="/search/cs?searchtype=author&amp;query=Wan%2C+Z">Zengyu Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+T">Tianzhu Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Cao%2C+Y">Yang Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Zha%2C+Z">Zheng-Jun Zha</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.15392v2-abstract-short" style="display: inline;"> Scene reconstruction from casually captured videos has wide applications in real-world scenarios. With recent advancements in differentiable rendering techniques, several methods have attempted to simultaneously optimize scene representations (NeRF or 3DGS) and camera poses. Despite recent progress, existing methods relying on traditional camera input tend to fail in high-speed (or equivalently lo&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15392v2-abstract-full').style.display = 'inline'; document.getElementById('2410.15392v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.15392v2-abstract-full" style="display: none;"> Scene reconstruction from casually captured videos has wide applications in real-world scenarios. With recent advancements in differentiable rendering techniques, several methods have attempted to simultaneously optimize scene representations (NeRF or 3DGS) and camera poses. Despite recent progress, existing methods relying on traditional camera input tend to fail in high-speed (or equivalently low-frame-rate) scenarios. Event cameras, inspired by biological vision, record pixel-wise intensity changes asynchronously with high temporal resolution, providing valuable scene and motion information in blind inter-frame intervals. In this paper, we introduce the event camera to aid scene construction from a casually captured video for the first time, and propose Event-Aided Free-Trajectory 3DGS, called EF-3DGS, which seamlessly integrates the advantages of event cameras into 3DGS through three key components. First, we leverage the Event Generation Model (EGM) to fuse events and frames, supervising the rendered views observed by the event stream. Second, we adopt the Contrast Maximization (CMax) framework in a piece-wise manner to extract motion information by maximizing the contrast of the Image of Warped Events (IWE), thereby calibrating the estimated poses. Besides, based on the Linear Event Generation Model (LEGM), the brightness information encoded in the IWE is also utilized to constrain the 3DGS in the gradient domain. Third, to mitigate the absence of color information of events, we introduce photometric bundle adjustment (PBA) to ensure view consistency across events and frames. We evaluate our method on the public Tanks and Temples benchmark and a newly collected real-world dataset, RealEv-DAVIS. Our project page is https://lbh666.github.io/ef-3dgs/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15392v2-abstract-full').style.display = 'none'; document.getElementById('2410.15392v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project Page: https://lbh666.github.io/ef-3dgs/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.13523">arXiv:2410.13523</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.13523">pdf</a>, <a href="https://arxiv.org/format/2410.13523">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Can Medical Vision-Language Pre-training Succeed with Purely Synthetic Data? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+C">Che Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Wan%2C+Z">Zhongwei Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+H">Haozhe Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yinda Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Qaiser%2C+T">Talha Qaiser</a>, <a href="/search/cs?searchtype=author&amp;query=Jin%2C+C">Chen Jin</a>, <a href="/search/cs?searchtype=author&amp;query=Yousefi%2C+F">Fariba Yousefi</a>, <a href="/search/cs?searchtype=author&amp;query=Burlutskiy%2C+N">Nikolay Burlutskiy</a>, <a href="/search/cs?searchtype=author&amp;query=Arcucci%2C+R">Rossella Arcucci</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.13523v1-abstract-short" style="display: inline;"> Medical Vision-Language Pre-training (MedVLP) has made significant progress in enabling zero-shot tasks for medical image understanding. However, training MedVLP models typically requires large-scale datasets with paired, high-quality image-text data, which are scarce in the medical domain. Recent advancements in Large Language Models (LLMs) and diffusion models have made it possible to generate l&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13523v1-abstract-full').style.display = 'inline'; document.getElementById('2410.13523v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.13523v1-abstract-full" style="display: none;"> Medical Vision-Language Pre-training (MedVLP) has made significant progress in enabling zero-shot tasks for medical image understanding. However, training MedVLP models typically requires large-scale datasets with paired, high-quality image-text data, which are scarce in the medical domain. Recent advancements in Large Language Models (LLMs) and diffusion models have made it possible to generate large-scale synthetic image-text pairs. This raises the question: &#34;Can MedVLP succeed using purely synthetic data?&#34; To address this, we use off-the-shelf generative models to create synthetic radiology reports and paired Chest X-ray (CXR) images, and propose an automated pipeline to build a diverse, high-quality synthetic dataset, enabling a rigorous study that isolates model and training settings, focusing entirely from the data perspective. Our results show that MedVLP models trained exclusively on synthetic data outperform those trained on real data by 3.8% in averaged AUC on zero-shot classification. Moreover, using a combination of synthetic and real data leads to a further improvement of 9.07%. Additionally, MedVLP models trained on synthetic or mixed data consistently outperform those trained on real data in zero-shot grounding, as well as in fine-tuned classification and segmentation tasks. Our analysis suggests MedVLP trained on well-designed synthetic data can outperform models trained on real datasets, which may be limited by low-quality samples and long-tailed distributions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13523v1-abstract-full').style.display = 'none'; document.getElementById('2410.13523v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Under Review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.10751">arXiv:2410.10751</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.10751">pdf</a>, <a href="https://arxiv.org/format/2410.10751">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> DragEntity: Trajectory Guided Video Generation using Entity and Positional Relationships </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wan%2C+Z">Zhang Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+S">Sheng Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+J">Jiawei Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+R">Ruize Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Cao%2C+J">Juan Cao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.10751v1-abstract-short" style="display: inline;"> In recent years, diffusion models have achieved tremendous success in the field of video generation, with controllable video generation receiving significant attention. However, existing control methods still face two limitations: Firstly, control conditions (such as depth maps, 3D Mesh) are difficult for ordinary users to obtain directly. Secondly, it&#39;s challenging to drive multiple objects throu&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10751v1-abstract-full').style.display = 'inline'; document.getElementById('2410.10751v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.10751v1-abstract-full" style="display: none;"> In recent years, diffusion models have achieved tremendous success in the field of video generation, with controllable video generation receiving significant attention. However, existing control methods still face two limitations: Firstly, control conditions (such as depth maps, 3D Mesh) are difficult for ordinary users to obtain directly. Secondly, it&#39;s challenging to drive multiple objects through complex motions with multiple trajectories simultaneously. In this paper, we introduce DragEntity, a video generation model that utilizes entity representation for controlling the motion of multiple objects. Compared to previous methods, DragEntity offers two main advantages: 1) Our method is more user-friendly for interaction because it allows users to drag entities within the image rather than individual pixels. 2) We use entity representation to represent any object in the image, and multiple objects can maintain relative spatial relationships. Therefore, we allow multiple trajectories to control multiple objects in the image with different levels of complexity simultaneously. Our experiments validate the effectiveness of DragEntity, demonstrating its excellent performance in fine-grained control in video generation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10751v1-abstract-full').style.display = 'none'; document.getElementById('2410.10751v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ACM MM2024 Oral</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.09671">arXiv:2410.09671</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.09671">pdf</a>, <a href="https://arxiv.org/format/2410.09671">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> OpenR: An Open Source Framework for Advanced Reasoning with Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Jun Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Fang%2C+M">Meng Fang</a>, <a href="/search/cs?searchtype=author&amp;query=Wan%2C+Z">Ziyu Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Wen%2C+M">Muning Wen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+J">Jiachen Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+A">Anjie Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Gong%2C+Z">Ziqin Gong</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+Y">Yan Song</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+L">Lei Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Ni%2C+L+M">Lionel M. Ni</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+L">Linyi Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Wen%2C+Y">Ying Wen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+W">Weinan Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.09671v1-abstract-short" style="display: inline;"> In this technical report, we introduce OpenR, an open-source framework designed to integrate key components for enhancing the reasoning capabilities of large language models (LLMs). OpenR unifies data acquisition, reinforcement learning training (both online and offline), and non-autoregressive decoding into a cohesive software platform. Our goal is to establish an open-source platform and communi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09671v1-abstract-full').style.display = 'inline'; document.getElementById('2410.09671v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.09671v1-abstract-full" style="display: none;"> In this technical report, we introduce OpenR, an open-source framework designed to integrate key components for enhancing the reasoning capabilities of large language models (LLMs). OpenR unifies data acquisition, reinforcement learning training (both online and offline), and non-autoregressive decoding into a cohesive software platform. Our goal is to establish an open-source platform and community to accelerate the development of LLM reasoning. Inspired by the success of OpenAI&#39;s o1 model, which demonstrated improved reasoning abilities through step-by-step reasoning and reinforcement learning, OpenR integrates test-time compute, reinforcement learning, and process supervision to improve reasoning in LLMs. Our work is the first to provide an open-source framework that explores the core techniques of OpenAI&#39;s o1 model with reinforcement learning, achieving advanced reasoning capabilities beyond traditional autoregressive methods. We demonstrate the efficacy of OpenR by evaluating it on the MATH dataset, utilising publicly available data and search methods. Our initial experiments confirm substantial gains, with relative improvements in reasoning and performance driven by test-time computation and reinforcement learning through process reward models. The OpenR framework, including code, models, and datasets, is accessible at https://openreasoner.github.io. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09671v1-abstract-full').style.display = 'none'; document.getElementById('2410.09671v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.06782">arXiv:2410.06782</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.06782">pdf</a>, <a href="https://arxiv.org/format/2410.06782">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Mind Your Questions! Towards Backdoor Attacks on Text-to-Visualization Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+S">Shuaimin Li</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+Y">Yuanfeng Song</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xuanang Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+A">Anni Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Wan%2C+Z">Zhuoyue Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+C+J">Chen Jason Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Wong%2C+R+C">Raymond Chi-Wing Wong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.06782v2-abstract-short" style="display: inline;"> Text-to-visualization (text-to-vis) models have become valuable tools in the era of big data, enabling users to generate data visualizations and make informed decisions through natural language queries (NLQs). Despite their widespread application, the security vulnerabilities of these models have been largely overlooked. To address this gap, we propose VisPoison, a novel framework designed to iden&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06782v2-abstract-full').style.display = 'inline'; document.getElementById('2410.06782v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.06782v2-abstract-full" style="display: none;"> Text-to-visualization (text-to-vis) models have become valuable tools in the era of big data, enabling users to generate data visualizations and make informed decisions through natural language queries (NLQs). Despite their widespread application, the security vulnerabilities of these models have been largely overlooked. To address this gap, we propose VisPoison, a novel framework designed to identify these vulnerabilities of current text-to-vis models systematically. VisPoison introduces two types of triggers that activate three distinct backdoor attacks, potentially leading to data exposure, misleading visualizations, or denial-of-service (DoS) incidents. The framework features both proactive and passive attack mechanisms: proactive attacks leverage rare-word triggers to access confidential data, while passive attacks, triggered unintentionally by users, exploit a first-word trigger method, causing errors or DoS events in visualizations. Through extensive experiments on both trainable and in-context learning (ICL)-based text-to-vis models, \textit{VisPoison} achieves attack success rates of over 90\%, highlighting the security problem of current text-to-vis models. Additionally, we explore two types of defense mechanisms against these attacks, but the results show that existing countermeasures are insufficient, underscoring the pressing need for more robust security solutions in text-to-vis systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06782v2-abstract-full').style.display = 'none'; document.getElementById('2410.06782v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, 4 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.06151">arXiv:2410.06151</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.06151">pdf</a>, <a href="https://arxiv.org/format/2410.06151">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Quality Diversity Imitation Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wan%2C+Z">Zhenglin Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+X">Xingrui Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Bossens%2C+D+M">David Mark Bossens</a>, <a href="/search/cs?searchtype=author&amp;query=Lyu%2C+Y">Yueming Lyu</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+Q">Qing Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Fan%2C+F+X">Flint Xiaofeng Fan</a>, <a href="/search/cs?searchtype=author&amp;query=Tsang%2C+I">Ivor Tsang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.06151v1-abstract-short" style="display: inline;"> Imitation learning (IL) has shown great potential in various applications, such as robot control. However, traditional IL methods are usually designed to learn only one specific type of behavior since demonstrations typically correspond to a single expert. In this work, we introduce the first generic framework for Quality Diversity Imitation Learning (QD-IL), which enables the agent to learn a bro&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06151v1-abstract-full').style.display = 'inline'; document.getElementById('2410.06151v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.06151v1-abstract-full" style="display: none;"> Imitation learning (IL) has shown great potential in various applications, such as robot control. However, traditional IL methods are usually designed to learn only one specific type of behavior since demonstrations typically correspond to a single expert. In this work, we introduce the first generic framework for Quality Diversity Imitation Learning (QD-IL), which enables the agent to learn a broad range of skills from limited demonstrations. Our framework integrates the principles of quality diversity with adversarial imitation learning (AIL) methods, and can potentially improve any inverse reinforcement learning (IRL) method. Empirically, our framework significantly improves the QD performance of GAIL and VAIL on the challenging continuous control tasks derived from Mujoco environments. Moreover, our method even achieves 2x expert performance in the most challenging Humanoid environment. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06151v1-abstract-full').style.display = 'none'; document.getElementById('2410.06151v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">22 pages, conference paper</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.04081">arXiv:2410.04081</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.04081">pdf</a>, <a href="https://arxiv.org/format/2410.04081">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> $蔚$-VAE: Denoising as Visual Decoding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+L">Long Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Woo%2C+S">Sanghyun Woo</a>, <a href="/search/cs?searchtype=author&amp;query=Wan%2C+Z">Ziyu Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yandong Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+H">Han Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Gong%2C+B">Boqing Gong</a>, <a href="/search/cs?searchtype=author&amp;query=Adam%2C+H">Hartwig Adam</a>, <a href="/search/cs?searchtype=author&amp;query=Jia%2C+X">Xuhui Jia</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+T">Ting Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.04081v1-abstract-short" style="display: inline;"> In generative modeling, tokenization simplifies complex data into compact, structured representations, creating a more efficient, learnable space. For high-dimensional visual data, it reduces redundancy and emphasizes key features for high-quality generation. Current visual tokenization methods rely on a traditional autoencoder framework, where the encoder compresses data into latent representatio&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.04081v1-abstract-full').style.display = 'inline'; document.getElementById('2410.04081v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.04081v1-abstract-full" style="display: none;"> In generative modeling, tokenization simplifies complex data into compact, structured representations, creating a more efficient, learnable space. For high-dimensional visual data, it reduces redundancy and emphasizes key features for high-quality generation. Current visual tokenization methods rely on a traditional autoencoder framework, where the encoder compresses data into latent representations, and the decoder reconstructs the original input. In this work, we offer a new perspective by proposing denoising as decoding, shifting from single-step reconstruction to iterative refinement. Specifically, we replace the decoder with a diffusion process that iteratively refines noise to recover the original image, guided by the latents provided by the encoder. We evaluate our approach by assessing both reconstruction (rFID) and generation quality (FID), comparing it to state-of-the-art autoencoding approach. We hope this work offers new insights into integrating iterative generation and autoencoding for improved compression and generation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.04081v1-abstract-full').style.display = 'none'; document.getElementById('2410.04081v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.03090">arXiv:2410.03090</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.03090">pdf</a>, <a href="https://arxiv.org/format/2410.03090">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> UNComp: Uncertainty-Aware Long-Context Compressor for Efficient Large Language Model Inference </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xiong%2C+J">Jing Xiong</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+J">Jianghan Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+F">Fanghua Ye</a>, <a href="/search/cs?searchtype=author&amp;query=Tao%2C+C">Chaofan Tao</a>, <a href="/search/cs?searchtype=author&amp;query=Wan%2C+Z">Zhongwei Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+J">Jianqiao Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+X">Xun Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+C">Chuanyang Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+Z">Zhijiang Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Kong%2C+L">Lingpeng Kong</a>, <a href="/search/cs?searchtype=author&amp;query=Wong%2C+N">Ngai Wong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.03090v1-abstract-short" style="display: inline;"> Deploying large language models (LLMs) is challenging due to their high memory and computational demands, especially during long-context inference. While key-value (KV) caching accelerates inference by reusing previously computed keys and values, it also introduces significant memory overhead. Existing KV cache compression methods such as eviction and merging typically compress the KV cache after&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.03090v1-abstract-full').style.display = 'inline'; document.getElementById('2410.03090v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.03090v1-abstract-full" style="display: none;"> Deploying large language models (LLMs) is challenging due to their high memory and computational demands, especially during long-context inference. While key-value (KV) caching accelerates inference by reusing previously computed keys and values, it also introduces significant memory overhead. Existing KV cache compression methods such as eviction and merging typically compress the KV cache after it is generated and overlook the eviction of hidden states, failing to improve the speed of the prefilling stage. Additionally, applying a uniform compression rate across different attention heads can harm crucial retrieval heads in needle-in-a-haystack tasks due to excessive compression. In this paper, we propose UNComp, an uncertainty-aware compression scheme that leverages matrix entropy to estimate model uncertainty across layers and heads at the token sequence level. By grouping layers and heads based on their uncertainty, UNComp adaptively compresses both the hidden states and the KV cache. Our method achieves a 1.6x speedup in the prefilling stage and reduces the KV cache to 4.74% of its original size, resulting in a 6.4x increase in throughput and a 1.4x speedup in inference with only a 1.41% performance loss. Remarkably, in needle-in-a-haystack tasks, UNComp outperforms the full-size KV cache even when compressed to 9.38% of its original size. Our approach offers an efficient, training-free Grouped-Query Attention paradigm that can be seamlessly integrated into existing KV cache schemes. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.03090v1-abstract-full').style.display = 'none'; document.getElementById('2410.03090v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.02719">arXiv:2410.02719</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.02719">pdf</a>, <a href="https://arxiv.org/format/2410.02719">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> UncertaintyRAG: Span-Level Uncertainty Enhanced Long-Context Modeling for Retrieval-Augmented Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Zixuan Li</a>, <a href="/search/cs?searchtype=author&amp;query=Xiong%2C+J">Jing Xiong</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+F">Fanghua Ye</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+C">Chuanyang Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+X">Xun Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+J">Jianqiao Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Wan%2C+Z">Zhongwei Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Liang%2C+X">Xiaodan Liang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Chengming Li</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+Z">Zhenan Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Kong%2C+L">Lingpeng Kong</a>, <a href="/search/cs?searchtype=author&amp;query=Wong%2C+N">Ngai Wong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.02719v1-abstract-short" style="display: inline;"> We present UncertaintyRAG, a novel approach for long-context Retrieval-Augmented Generation (RAG) that utilizes Signal-to-Noise Ratio (SNR)-based span uncertainty to estimate similarity between text chunks. This span uncertainty enhances model calibration, improving robustness and mitigating semantic inconsistencies introduced by random chunking. Leveraging this insight, we propose an efficient un&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.02719v1-abstract-full').style.display = 'inline'; document.getElementById('2410.02719v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.02719v1-abstract-full" style="display: none;"> We present UncertaintyRAG, a novel approach for long-context Retrieval-Augmented Generation (RAG) that utilizes Signal-to-Noise Ratio (SNR)-based span uncertainty to estimate similarity between text chunks. This span uncertainty enhances model calibration, improving robustness and mitigating semantic inconsistencies introduced by random chunking. Leveraging this insight, we propose an efficient unsupervised learning technique to train the retrieval model, alongside an effective data sampling and scaling strategy. UncertaintyRAG outperforms baselines by 2.03% on LLaMA-2-7B, achieving state-of-the-art results while using only 4% of the training data compared to other advanced open-source retrieval models under distribution shift settings. Our method demonstrates strong calibration through span uncertainty, leading to improved generalization and robustness in long-context RAG tasks. Additionally, UncertaintyRAG provides a lightweight retrieval model that can be integrated into any large language model with varying context window lengths, without the need for fine-tuning, showcasing the flexibility of our approach. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.02719v1-abstract-full').style.display = 'none'; document.getElementById('2410.02719v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.01776">arXiv:2410.01776</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.01776">pdf</a>, <a href="https://arxiv.org/format/2410.01776">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Atmospheric and Oceanic Physics">physics.ao-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Dynamical-generative downscaling of climate model ensembles </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lopez-Gomez%2C+I">Ignacio Lopez-Gomez</a>, <a href="/search/cs?searchtype=author&amp;query=Wan%2C+Z+Y">Zhong Yi Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Zepeda-N%C3%BA%C3%B1ez%2C+L">Leonardo Zepeda-N煤帽ez</a>, <a href="/search/cs?searchtype=author&amp;query=Schneider%2C+T">Tapio Schneider</a>, <a href="/search/cs?searchtype=author&amp;query=Anderson%2C+J">John Anderson</a>, <a href="/search/cs?searchtype=author&amp;query=Sha%2C+F">Fei Sha</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.01776v1-abstract-short" style="display: inline;"> Regional high-resolution climate projections are crucial for many applications, such as agriculture, hydrology, and natural hazard risk assessment. Dynamical downscaling, the state-of-the-art method to produce localized future climate information, involves running a regional climate model (RCM) driven by an Earth System Model (ESM), but it is too computationally expensive to apply to large climate&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.01776v1-abstract-full').style.display = 'inline'; document.getElementById('2410.01776v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.01776v1-abstract-full" style="display: none;"> Regional high-resolution climate projections are crucial for many applications, such as agriculture, hydrology, and natural hazard risk assessment. Dynamical downscaling, the state-of-the-art method to produce localized future climate information, involves running a regional climate model (RCM) driven by an Earth System Model (ESM), but it is too computationally expensive to apply to large climate projection ensembles. We propose a novel approach combining dynamical downscaling with generative artificial intelligence to reduce the cost and improve the uncertainty estimates of downscaled climate projections. In our framework, an RCM dynamically downscales ESM output to an intermediate resolution, followed by a generative diffusion model that further refines the resolution to the target scale. This approach leverages the generalizability of physics-based models and the sampling efficiency of diffusion models, enabling the downscaling of large multi-model ensembles. We evaluate our method against dynamically-downscaled climate projections from the CMIP6 ensemble. Our results demonstrate its ability to provide more accurate uncertainty bounds on future regional climate than alternatives such as dynamical downscaling of smaller ensembles, or traditional empirical statistical downscaling methods. We also show that dynamical-generative downscaling results in significantly lower errors than bias correction and spatial disaggregation (BCSD), and captures more accurately the spectra and multivariate correlations of meteorological fields. These characteristics make the dynamical-generative framework a flexible, accurate, and efficient way to downscale large ensembles of climate projections, currently out of reach for pure dynamical downscaling. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.01776v1-abstract-full').style.display = 'none'; document.getElementById('2410.01776v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.00360">arXiv:2410.00360</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.00360">pdf</a>, <a href="https://arxiv.org/format/2410.00360">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> TFCT-I2P: Three stream fusion network with color aware transformer for image-to-point cloud registration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Peng%2C+M">Muyao Peng</a>, <a href="/search/cs?searchtype=author&amp;query=An%2C+P">Pei An</a>, <a href="/search/cs?searchtype=author&amp;query=Wan%2C+Z">Zichen Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Y">You Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Q">Qiong Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.00360v1-abstract-short" style="display: inline;"> Along with the advancements in artificial intelligence technologies, image-to-point-cloud registration (I2P) techniques have made significant strides. Nevertheless, the dimensional differences in the features of points cloud (three-dimension) and image (two-dimension) continue to pose considerable challenges to their development. The primary challenge resides in the inability to leverage the featu&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.00360v1-abstract-full').style.display = 'inline'; document.getElementById('2410.00360v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.00360v1-abstract-full" style="display: none;"> Along with the advancements in artificial intelligence technologies, image-to-point-cloud registration (I2P) techniques have made significant strides. Nevertheless, the dimensional differences in the features of points cloud (three-dimension) and image (two-dimension) continue to pose considerable challenges to their development. The primary challenge resides in the inability to leverage the features of one modality to augment those of another, thereby complicating the alignment of features within the latent space. To address this challenge, we propose an image-to-point-cloud method named as TFCT-I2P. Initially, we introduce a Three-Stream Fusion Network (TFN), which integrates color information from images with structural information from point clouds, facilitating the alignment of features from both modalities. Subsequently, to effectively mitigate patch-level misalignments introduced by the inclusion of color information, we design a Color-Aware Transformer (CAT). Finally, we conduct extensive experiments on 7Scenes, RGB-D Scenes V2, ScanNet V2, and a self-collected dataset. The results demonstrate that TFCT-I2P surpasses state-of-the-art methods by 1.5% in Inlier Ratio, 0.4% in Feature Matching Recall, and 5.4% in Registration Recall. Therefore, we believe that the proposed TFCT-I2P contributes to the advancement of I2P registration. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.00360v1-abstract-full').style.display = 'none'; document.getElementById('2410.00360v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.20332">arXiv:2409.20332</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.20332">pdf</a>, <a href="https://arxiv.org/format/2409.20332">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Devil is in Details: Locality-Aware 3D Abdominal CT Volume Generation for Self-Supervised Organ Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yuran Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wan%2C+Z">Zhijing Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Qiu%2C+Y">Yansheng Qiu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zheng Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.20332v1-abstract-short" style="display: inline;"> In the realm of medical image analysis, self-supervised learning (SSL) techniques have emerged to alleviate labeling demands, while still facing the challenge of training data scarcity owing to escalating resource requirements and privacy constraints. Numerous efforts employ generative models to generate high-fidelity, unlabeled 3D volumes across diverse modalities and anatomical regions. However,&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.20332v1-abstract-full').style.display = 'inline'; document.getElementById('2409.20332v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.20332v1-abstract-full" style="display: none;"> In the realm of medical image analysis, self-supervised learning (SSL) techniques have emerged to alleviate labeling demands, while still facing the challenge of training data scarcity owing to escalating resource requirements and privacy constraints. Numerous efforts employ generative models to generate high-fidelity, unlabeled 3D volumes across diverse modalities and anatomical regions. However, the intricate and indistinguishable anatomical structures within the abdomen pose a unique challenge to abdominal CT volume generation compared to other anatomical regions. To address the overlooked challenge, we introduce the Locality-Aware Diffusion (Lad), a novel method tailored for exquisite 3D abdominal CT volume generation. We design a locality loss to refine crucial anatomical regions and devise a condition extractor to integrate abdominal priori into generation, thereby enabling the generation of large quantities of high-quality abdominal CT volumes essential for SSL tasks without the need for additional data such as labels or radiology reports. Volumes generated through our method demonstrate remarkable fidelity in reproducing abdominal structures, achieving a decrease in FID score from 0.0034 to 0.0002 on AbdomenCT-1K dataset, closely mirroring authentic data and surpassing current methods. Extensive experiments demonstrate the effectiveness of our method in self-supervised organ segmentation tasks, resulting in an improvement in mean Dice scores on two abdominal datasets effectively. These results underscore the potential of synthetic data to advance self-supervised learning in medical image analysis. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.20332v1-abstract-full').style.display = 'none'; document.getElementById('2409.20332v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.19892">arXiv:2409.19892</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.19892">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> VAP: The Vulnerability-Adaptive Protection Paradigm Toward Reliable Autonomous Machines </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wan%2C+Z">Zishen Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Gan%2C+Y">Yiming Gan</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+B">Bo Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+S">Shaoshan Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Raychowdhury%2C+A">Arijit Raychowdhury</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+Y">Yuhao Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.19892v1-abstract-short" style="display: inline;"> The next ubiquitous computing platform, following personal computers and smartphones, is poised to be inherently autonomous, encompassing technologies like drones, robots, and self-driving cars. Ensuring reliability for these autonomous machines is critical. However, current resiliency solutions make fundamental trade-offs between reliability and cost, resulting in significant overhead in performa&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.19892v1-abstract-full').style.display = 'inline'; document.getElementById('2409.19892v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.19892v1-abstract-full" style="display: none;"> The next ubiquitous computing platform, following personal computers and smartphones, is poised to be inherently autonomous, encompassing technologies like drones, robots, and self-driving cars. Ensuring reliability for these autonomous machines is critical. However, current resiliency solutions make fundamental trade-offs between reliability and cost, resulting in significant overhead in performance, energy consumption, and chip area. This is due to the &#34;one-size-fits-all&#34; approach commonly used, where the same protection scheme is applied throughout the entire software computing stack. This paper presents the key insight that to achieve high protection coverage with minimal cost, we must leverage the inherent variations in robustness across different layers of the autonomous machine software stack. Specifically, we demonstrate that various nodes in this complex stack exhibit different levels of robustness against hardware faults. Our findings reveal that the front-end of an autonomous machine&#39;s software stack tends to be more robust, whereas the back-end is generally more vulnerable. Building on these inherent robustness differences, we propose a Vulnerability-Adaptive Protection (VAP) design paradigm. In this paradigm, the allocation of protection resources - whether spatially (e.g., through modular redundancy) or temporally (e.g., via re-execution) - is made inversely proportional to the inherent robustness of tasks or algorithms within the autonomous machine system. Experimental results show that VAP provides high protection coverage while maintaining low overhead in both autonomous vehicle and drone systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.19892v1-abstract-full').style.display = 'none'; document.getElementById('2409.19892v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Communications of the ACM (CACM), Research and Advances, Vol 67, No.9, September 2024. ACM Link: https://dl.acm.org/doi/pdf/10.1145/3647638</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.18359">arXiv:2409.18359</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.18359">pdf</a>, <a href="https://arxiv.org/format/2409.18359">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Numerical Analysis">math.NA</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Fluid Dynamics">physics.flu-dyn</span> </div> </div> <p class="title is-5 mathjax"> Generative AI for fast and accurate Statistical Computation of Fluids </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Molinaro%2C+R">Roberto Molinaro</a>, <a href="/search/cs?searchtype=author&amp;query=Lanthaler%2C+S">Samuel Lanthaler</a>, <a href="/search/cs?searchtype=author&amp;query=Raoni%C4%87%2C+B">Bogdan Raoni膰</a>, <a href="/search/cs?searchtype=author&amp;query=Rohner%2C+T">Tobias Rohner</a>, <a href="/search/cs?searchtype=author&amp;query=Armegioiu%2C+V">Victor Armegioiu</a>, <a href="/search/cs?searchtype=author&amp;query=Wan%2C+Z+Y">Zhong Yi Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Sha%2C+F">Fei Sha</a>, <a href="/search/cs?searchtype=author&amp;query=Mishra%2C+S">Siddhartha Mishra</a>, <a href="/search/cs?searchtype=author&amp;query=Zepeda-N%C3%BA%C3%B1ez%2C+L">Leonardo Zepeda-N煤帽ez</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.18359v1-abstract-short" style="display: inline;"> We present a generative AI algorithm for addressing the challenging task of fast, accurate and robust statistical computation of three-dimensional turbulent fluid flows. Our algorithm, termed as GenCFD, is based on a conditional score-based diffusion model. Through extensive numerical experimentation with both incompressible and compressible fluid flows, we demonstrate that GenCFD provides very ac&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.18359v1-abstract-full').style.display = 'inline'; document.getElementById('2409.18359v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.18359v1-abstract-full" style="display: none;"> We present a generative AI algorithm for addressing the challenging task of fast, accurate and robust statistical computation of three-dimensional turbulent fluid flows. Our algorithm, termed as GenCFD, is based on a conditional score-based diffusion model. Through extensive numerical experimentation with both incompressible and compressible fluid flows, we demonstrate that GenCFD provides very accurate approximation of statistical quantities of interest such as mean, variance, point pdfs, higher-order moments, while also generating high quality realistic samples of turbulent fluid flows and ensuring excellent spectral resolution. In contrast, ensembles of operator learning baselines which are trained to minimize mean (absolute) square errors regress to the mean flow. We present rigorous theoretical results uncovering the surprising mechanisms through which diffusion models accurately generate fluid flows. These mechanisms are illustrated with solvable toy models that exhibit the relevant features of turbulent fluid flows while being amenable to explicit analytical formulas. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.18359v1-abstract-full').style.display = 'none'; document.getElementById('2409.18359v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">71 pages, 30 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.17424">arXiv:2409.17424</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.17424">pdf</a>, <a href="https://arxiv.org/format/2409.17424">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Data Structures and Algorithms">cs.DS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Performance">cs.PF</span> </div> </div> <p class="title is-5 mathjax"> Results of the Big ANN: NeurIPS&#39;23 competition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Simhadri%2C+H+V">Harsha Vardhan Simhadri</a>, <a href="/search/cs?searchtype=author&amp;query=Aum%C3%BCller%2C+M">Martin Aum眉ller</a>, <a href="/search/cs?searchtype=author&amp;query=Ingber%2C+A">Amir Ingber</a>, <a href="/search/cs?searchtype=author&amp;query=Douze%2C+M">Matthijs Douze</a>, <a href="/search/cs?searchtype=author&amp;query=Williams%2C+G">George Williams</a>, <a href="/search/cs?searchtype=author&amp;query=Manohar%2C+M+D">Magdalen Dobson Manohar</a>, <a href="/search/cs?searchtype=author&amp;query=Baranchuk%2C+D">Dmitry Baranchuk</a>, <a href="/search/cs?searchtype=author&amp;query=Liberty%2C+E">Edo Liberty</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+F">Frank Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Landrum%2C+B">Ben Landrum</a>, <a href="/search/cs?searchtype=author&amp;query=Karjikar%2C+M">Mazin Karjikar</a>, <a href="/search/cs?searchtype=author&amp;query=Dhulipala%2C+L">Laxman Dhulipala</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+M">Meng Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yue Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+R">Rui Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+K">Kai Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Cai%2C+Y">Yuzheng Cai</a>, <a href="/search/cs?searchtype=author&amp;query=Shi%2C+J">Jiayang Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yizhuo Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+W">Weiguo Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Wan%2C+Z">Zihao Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Yin%2C+J">Jie Yin</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+B">Ben Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.17424v1-abstract-short" style="display: inline;"> The 2023 Big ANN Challenge, held at NeurIPS 2023, focused on advancing the state-of-the-art in indexing data structures and search algorithms for practical variants of Approximate Nearest Neighbor (ANN) search that reflect the growing complexity and diversity of workloads. Unlike prior challenges that emphasized scaling up classical ANN search ~\cite{DBLP:conf/nips/SimhadriWADBBCH21}, this competi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.17424v1-abstract-full').style.display = 'inline'; document.getElementById('2409.17424v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.17424v1-abstract-full" style="display: none;"> The 2023 Big ANN Challenge, held at NeurIPS 2023, focused on advancing the state-of-the-art in indexing data structures and search algorithms for practical variants of Approximate Nearest Neighbor (ANN) search that reflect the growing complexity and diversity of workloads. Unlike prior challenges that emphasized scaling up classical ANN search ~\cite{DBLP:conf/nips/SimhadriWADBBCH21}, this competition addressed filtered search, out-of-distribution data, sparse and streaming variants of ANNS. Participants developed and submitted innovative solutions that were evaluated on new standard datasets with constrained computational resources. The results showcased significant improvements in search accuracy and efficiency over industry-standard baselines, with notable contributions from both academic and industrial teams. This paper summarizes the competition tracks, datasets, evaluation metrics, and the innovative approaches of the top-performing submissions, providing insights into the current advancements and future directions in the field of approximate nearest neighbor search. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.17424v1-abstract-full').style.display = 'none'; document.getElementById('2409.17424v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Code: https://github.com/harsha-simhadri/big-ann-benchmarks/releases/tag/v0.3.0</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> H.3.3 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.14972">arXiv:2409.14972</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.14972">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Deep Reinforcement Learning-based Obstacle Avoidance for Robot Movement in Warehouse Environments </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+K">Keqin Li</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+J">Jiajing Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+D">Denzhi Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Dajun%2C+T">Tao Dajun</a>, <a href="/search/cs?searchtype=author&amp;query=Qiu%2C+X">Xinyu Qiu</a>, <a href="/search/cs?searchtype=author&amp;query=Jieting%2C+L">Lian Jieting</a>, <a href="/search/cs?searchtype=author&amp;query=Baiwei%2C+S">Sun Baiwei</a>, <a href="/search/cs?searchtype=author&amp;query=Shengyuan%2C+Z">Zhang Shengyuan</a>, <a href="/search/cs?searchtype=author&amp;query=Wan%2C+Z">Zhenyu Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Ji%2C+R">Ran Ji</a>, <a href="/search/cs?searchtype=author&amp;query=Hong%2C+B">Bo Hong</a>, <a href="/search/cs?searchtype=author&amp;query=Ni%2C+F">Fanghao Ni</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.14972v1-abstract-short" style="display: inline;"> At present, in most warehouse environments, the accumulation of goods is complex, and the management personnel in the control of goods at the same time with the warehouse mobile robot trajectory interaction, the traditional mobile robot can not be very good on the goods and pedestrians to feed back the correct obstacle avoidance strategy, in order to control the mobile robot in the warehouse envir&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.14972v1-abstract-full').style.display = 'inline'; document.getElementById('2409.14972v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.14972v1-abstract-full" style="display: none;"> At present, in most warehouse environments, the accumulation of goods is complex, and the management personnel in the control of goods at the same time with the warehouse mobile robot trajectory interaction, the traditional mobile robot can not be very good on the goods and pedestrians to feed back the correct obstacle avoidance strategy, in order to control the mobile robot in the warehouse environment efficiently and friendly to complete the obstacle avoidance task, this paper proposes a deep reinforcement learning based on the warehouse environment, the mobile robot obstacle avoidance Algorithm. Firstly, for the insufficient learning ability of the value function network in the deep reinforcement learning algorithm, the value function network is improved based on the pedestrian interaction, the interaction information between pedestrians is extracted through the pedestrian angle grid, and the temporal features of individual pedestrians are extracted through the attention mechanism, so that we can learn to obtain the relative importance of the current state and the historical trajectory state as well as the joint impact on the robot&#39;s obstacle avoidance strategy, which provides an opportunity for the learning of multi-layer perceptual machines afterwards. Secondly, the reward function of reinforcement learning is designed based on the spatial behaviour of pedestrians, and the robot is punished for the state where the angle changes too much, so as to achieve the requirement of comfortable obstacle avoidance; Finally, the feasibility and effectiveness of the deep reinforcement learning-based mobile robot obstacle avoidance algorithm in the warehouse environment in the complex environment of the warehouse are verified through simulation experiments. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.14972v1-abstract-full').style.display = 'none'; document.getElementById('2409.14972v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.13194">arXiv:2409.13194</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.13194">pdf</a>, <a href="https://arxiv.org/format/2409.13194">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> ChemDFM-X: Towards Large Multimodal Model for Chemistry </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+Z">Zihan Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+B">Bo Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jingpiao Li</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+L">Lu Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Wen%2C+L">Liyang Wen</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+P">Pengyu Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+Z">Zichen Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+D">Danyang Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Wan%2C+Z">Ziping Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yansi Li</a>, <a href="/search/cs?searchtype=author&amp;query=Dai%2C+Z">Zhongyang Dai</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xin Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+K">Kai Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.13194v1-abstract-short" style="display: inline;"> Rapid developments of AI tools are expected to offer unprecedented assistance to the research of natural science including chemistry. However, neither existing unimodal task-specific specialist models nor emerging general large multimodal models (LMM) can cover the wide range of chemical data modality and task categories. To address the real demands of chemists, a cross-modal Chemical General Inte&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.13194v1-abstract-full').style.display = 'inline'; document.getElementById('2409.13194v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.13194v1-abstract-full" style="display: none;"> Rapid developments of AI tools are expected to offer unprecedented assistance to the research of natural science including chemistry. However, neither existing unimodal task-specific specialist models nor emerging general large multimodal models (LMM) can cover the wide range of chemical data modality and task categories. To address the real demands of chemists, a cross-modal Chemical General Intelligence (CGI) system, which serves as a truly practical and useful research assistant utilizing the great potential of LMMs, is in great need. In this work, we introduce the first Cross-modal Dialogue Foundation Model for Chemistry (ChemDFM-X). Diverse multimodal data are generated from an initial modality by approximate calculations and task-specific model predictions. This strategy creates sufficient chemical training corpora, while significantly reducing excessive expense, resulting in an instruction-tuning dataset containing 7.6M data. After instruction finetuning, ChemDFM-X is evaluated on extensive experiments of different chemical tasks with various data modalities. The results demonstrate the capacity of ChemDFM-X for multimodal and inter-modal knowledge comprehension. ChemDFM-X marks a significant milestone toward aligning all modalities in chemistry, a step closer to CGI. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.13194v1-abstract-full').style.display = 'none'; document.getElementById('2409.13194v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">19 pages, 7 figures, 11 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.13153">arXiv:2409.13153</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.13153">pdf</a>, <a href="https://arxiv.org/format/2409.13153">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Towards Efficient Neuro-Symbolic AI: From Workload Characterization to Hardware Architecture </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wan%2C+Z">Zishen Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+C">Che-Kai Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+H">Hanchen Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Raj%2C+R">Ritik Raj</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Chaojian Li</a>, <a href="/search/cs?searchtype=author&amp;query=You%2C+H">Haoran You</a>, <a href="/search/cs?searchtype=author&amp;query=Fu%2C+Y">Yonggan Fu</a>, <a href="/search/cs?searchtype=author&amp;query=Wan%2C+C">Cheng Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+S">Sixu Li</a>, <a href="/search/cs?searchtype=author&amp;query=Kim%2C+Y">Youbin Kim</a>, <a href="/search/cs?searchtype=author&amp;query=Samajdar%2C+A">Ananda Samajdar</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+Y+C">Yingyan Celine Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Ibrahim%2C+M">Mohamed Ibrahim</a>, <a href="/search/cs?searchtype=author&amp;query=Rabaey%2C+J+M">Jan M. Rabaey</a>, <a href="/search/cs?searchtype=author&amp;query=Krishna%2C+T">Tushar Krishna</a>, <a href="/search/cs?searchtype=author&amp;query=Raychowdhury%2C+A">Arijit Raychowdhury</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.13153v2-abstract-short" style="display: inline;"> The remarkable advancements in artificial intelligence (AI), primarily driven by deep neural networks, are facing challenges surrounding unsustainable computational trajectories, limited robustness, and a lack of explainability. To develop next-generation cognitive AI systems, neuro-symbolic AI emerges as a promising paradigm, fusing neural and symbolic approaches to enhance interpretability, robu&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.13153v2-abstract-full').style.display = 'inline'; document.getElementById('2409.13153v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.13153v2-abstract-full" style="display: none;"> The remarkable advancements in artificial intelligence (AI), primarily driven by deep neural networks, are facing challenges surrounding unsustainable computational trajectories, limited robustness, and a lack of explainability. To develop next-generation cognitive AI systems, neuro-symbolic AI emerges as a promising paradigm, fusing neural and symbolic approaches to enhance interpretability, robustness, and trustworthiness, while facilitating learning from much less data. Recent neuro-symbolic systems have demonstrated great potential in collaborative human-AI scenarios with reasoning and cognitive capabilities. In this paper, we aim to understand the workload characteristics and potential architectures for neuro-symbolic AI. We first systematically categorize neuro-symbolic AI algorithms, and then experimentally evaluate and analyze them in terms of runtime, memory, computational operators, sparsity, and system characteristics on CPUs, GPUs, and edge SoCs. Our studies reveal that neuro-symbolic models suffer from inefficiencies on off-the-shelf hardware, due to the memory-bound nature of vector-symbolic and logical operations, complex flow control, data dependencies, sparsity variations, and limited scalability. Based on profiling insights, we suggest cross-layer optimization solutions and present a hardware acceleration case study for vector-symbolic architecture to improve the performance, efficiency, and scalability of neuro-symbolic computing. Finally, we discuss the challenges and potential future directions of neuro-symbolic AI from both system and architectural perspectives. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.13153v2-abstract-full').style.display = 'none'; document.getElementById('2409.13153v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">14 pages, 11 figures, 7 tables; IEEE Transactions on Circuits and Systems for Artificial Intelligence (TCASAI), 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.09808">arXiv:2409.09808</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.09808">pdf</a>, <a href="https://arxiv.org/format/2409.09808">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Famba-V: Fast Vision Mamba with Cross-Layer Token Fusion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Shen%2C+H">Hui Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Wan%2C+Z">Zhongwei Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xin Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+M">Mi Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.09808v3-abstract-short" style="display: inline;"> Mamba and Vision Mamba (Vim) models have shown their potential as an alternative to methods based on Transformer architecture. This work introduces Fast Mamba for Vision (Famba-V), a cross-layer token fusion technique to enhance the training efficiency of Vim models. The key idea of Famba-V is to identify and fuse similar tokens across different Vim layers based on a suit of cross-layer strategies&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.09808v3-abstract-full').style.display = 'inline'; document.getElementById('2409.09808v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.09808v3-abstract-full" style="display: none;"> Mamba and Vision Mamba (Vim) models have shown their potential as an alternative to methods based on Transformer architecture. This work introduces Fast Mamba for Vision (Famba-V), a cross-layer token fusion technique to enhance the training efficiency of Vim models. The key idea of Famba-V is to identify and fuse similar tokens across different Vim layers based on a suit of cross-layer strategies instead of simply applying token fusion uniformly across all the layers that existing works propose. We evaluate the performance of Famba-V on CIFAR-100. Our results show that Famba-V is able to enhance the training efficiency of Vim models by reducing both training time and peak memory usage during training. Moreover, the proposed cross-layer strategies allow Famba-V to deliver superior accuracy-efficiency trade-offs. These results all together demonstrate Famba-V as a promising efficiency enhancement technique for Vim models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.09808v3-abstract-full').style.display = 'none'; document.getElementById('2409.09808v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Camera ready version of ECCV 2024 Workshop on Computational Aspects of Deep Learning (Best Paper Award)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.01341">arXiv:2409.01341</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.01341">pdf</a>, <a href="https://arxiv.org/format/2409.01341">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Enhancing Test Time Adaptation with Few-shot Guidance </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Luo%2C+S">Siqi Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Xin%2C+Y">Yi Xin</a>, <a href="/search/cs?searchtype=author&amp;query=Du%2C+Y">Yuntao Du</a>, <a href="/search/cs?searchtype=author&amp;query=Wan%2C+Z">Zhongwei Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Tan%2C+T">Tao Tan</a>, <a href="/search/cs?searchtype=author&amp;query=Zhai%2C+G">Guangtao Zhai</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+X">Xiaohong Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.01341v1-abstract-short" style="display: inline;"> Deep neural networks often encounter significant performance drops while facing with domain shifts between training (source) and test (target) data. To address this issue, Test Time Adaptation (TTA) methods have been proposed to adapt pre-trained source model to handle out-of-distribution streaming target data. Although these methods offer some relief, they lack a reliable mechanism for domain shi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.01341v1-abstract-full').style.display = 'inline'; document.getElementById('2409.01341v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.01341v1-abstract-full" style="display: none;"> Deep neural networks often encounter significant performance drops while facing with domain shifts between training (source) and test (target) data. To address this issue, Test Time Adaptation (TTA) methods have been proposed to adapt pre-trained source model to handle out-of-distribution streaming target data. Although these methods offer some relief, they lack a reliable mechanism for domain shift correction, which can often be erratic in real-world applications. In response, we develop Few-Shot Test Time Adaptation (FS-TTA), a novel and practical setting that utilizes a few-shot support set on top of TTA. Adhering to the principle of few inputs, big gains, FS-TTA reduces blind exploration in unseen target domains. Furthermore, we propose a two-stage framework to tackle FS-TTA, including (i) fine-tuning the pre-trained source model with few-shot support set, along with using feature diversity augmentation module to avoid overfitting, (ii) implementing test time adaptation based on prototype memory bank guidance to produce high quality pseudo-label for model adaptation. Through extensive experiments on three cross-domain classification benchmarks, we demonstrate the superior performance and reliability of our FS-TTA and framework. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.01341v1-abstract-full').style.display = 'none'; document.getElementById('2409.01341v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 7 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.12984">arXiv:2408.12984</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.12984">pdf</a>, <a href="https://arxiv.org/format/2408.12984">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Materials Science">cond-mat.mtrl-sci</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> PDDFormer: Pairwise Distance Distribution Graph Transformer for Crystal Material Property Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Shen%2C+X">Xiangxiang Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Wan%2C+Z">Zheng Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Wen%2C+L">Lingfeng Wen</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+L">Licheng Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Jie%2C+O+Y+M">Ou Yang Ming Jie</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+J">JiJUn Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+X">Xuan Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+X">Xian Wei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.12984v4-abstract-short" style="display: inline;"> The crystal structure can be simplified as a periodic point set repeating across the entire three-dimensional space along an underlying lattice. Traditionally, methods for representing crystals rely on descriptors like lattice parameters, symmetry, and space groups to characterize the structure. However, in reality, atoms in material always vibrate above absolute zero, causing continuous fluctuati&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.12984v4-abstract-full').style.display = 'inline'; document.getElementById('2408.12984v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.12984v4-abstract-full" style="display: none;"> The crystal structure can be simplified as a periodic point set repeating across the entire three-dimensional space along an underlying lattice. Traditionally, methods for representing crystals rely on descriptors like lattice parameters, symmetry, and space groups to characterize the structure. However, in reality, atoms in material always vibrate above absolute zero, causing continuous fluctuations in their positions. This dynamic behavior disrupts the underlying periodicity of the lattice, making crystal graphs based on static lattice parameters and conventional descriptors discontinuous under even slight perturbations. To this end, chemists proposed the Pairwise Distance Distribution (PDD) method, which has been used to distinguish all periodic structures in the world&#39;s largest real materials collection, the Cambridge Structural Database. However, achieving the completeness of PDD requires defining a large number of neighboring atoms, resulting in high computational costs. Moreover, it does not account for atomic information, making it challenging to directly apply PDD to crystal material property prediction tasks. To address these challenges, we propose the atom-Weighted Pairwise Distance Distribution (WPDD) and Unit cell Pairwise Distance Distribution (UPDD) for the first time, incorporating them into the construction of multi-edge crystal graphs. Based on this, we further developed WPDDFormer and UPDDFormer, graph transformer architecture constructed using WPDD and UPDD crystal graphs. We demonstrate that this method maintains the continuity and completeness of crystal graphs even under slight perturbations in atomic positions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.12984v4-abstract-full').style.display = 'none'; document.getElementById('2408.12984v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 3 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.11523">arXiv:2408.11523</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.11523">pdf</a>, <a href="https://arxiv.org/format/2408.11523">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3640457.3688135">10.1145/3640457.3688135 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> LARR: Large Language Model Aided Real-time Scene Recommendation with Semantic Understanding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wan%2C+Z">Zhizhong Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Yin%2C+B">Bin Yin</a>, <a href="/search/cs?searchtype=author&amp;query=Xie%2C+J">Junjie Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+F">Fei Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xiang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+W">Wei Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.11523v1-abstract-short" style="display: inline;"> Click-Through Rate (CTR) prediction is crucial for Recommendation System(RS), aiming to provide personalized recommendation services for users in many aspects such as food delivery, e-commerce and so on. However, traditional RS relies on collaborative signals, which lacks semantic understanding to real-time scenes. We also noticed that a major challenge in utilizing Large Language Models (LLMs) fo&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.11523v1-abstract-full').style.display = 'inline'; document.getElementById('2408.11523v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.11523v1-abstract-full" style="display: none;"> Click-Through Rate (CTR) prediction is crucial for Recommendation System(RS), aiming to provide personalized recommendation services for users in many aspects such as food delivery, e-commerce and so on. However, traditional RS relies on collaborative signals, which lacks semantic understanding to real-time scenes. We also noticed that a major challenge in utilizing Large Language Models (LLMs) for practical recommendation purposes is their efficiency in dealing with long text input. To break through the problems above, we propose Large Language Model Aided Real-time Scene Recommendation(LARR), adopt LLMs for semantic understanding, utilizing real-time scene information in RS without requiring LLM to process the entire real-time scene text directly, thereby enhancing the efficiency of LLM-based CTR modeling. Specifically, recommendation domain-specific knowledge is injected into LLM and then RS employs an aggregation encoder to build real-time scene information from separate LLM&#39;s outputs. Firstly, a LLM is continual pretrained on corpus built from recommendation data with the aid of special tokens. Subsequently, the LLM is fine-tuned via contrastive learning on three kinds of sample construction strategies. Through this step, LLM is transformed into a text embedding model. Finally, LLM&#39;s separate outputs for different scene features are aggregated by an encoder, aligning to collaborative signals in RS, enhancing the performance of recommendation model. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.11523v1-abstract-full').style.display = 'none'; document.getElementById('2408.11523v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.10811">arXiv:2408.10811</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.10811">pdf</a>, <a href="https://arxiv.org/format/2408.10811">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Beyond English-Centric LLMs: What Language Do Multilingual Language Models Think in? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhong%2C+C">Chengzhi Zhong</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+F">Fei Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Q">Qianying Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+J">Junfeng Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Wan%2C+Z">Zhen Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Chu%2C+C">Chenhui Chu</a>, <a href="/search/cs?searchtype=author&amp;query=Murawaki%2C+Y">Yugo Murawaki</a>, <a href="/search/cs?searchtype=author&amp;query=Kurohashi%2C+S">Sadao Kurohashi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.10811v1-abstract-short" style="display: inline;"> In this study, we investigate whether non-English-centric LLMs, despite their strong performance, `think&#39; in their respective dominant language: more precisely, `think&#39; refers to how the representations of intermediate layers, when un-embedded into the vocabulary space, exhibit higher probabilities for certain dominant languages during generation. We term such languages as internal&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.10811v1-abstract-full').style.display = 'inline'; document.getElementById('2408.10811v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.10811v1-abstract-full" style="display: none;"> In this study, we investigate whether non-English-centric LLMs, despite their strong performance, `think&#39; in their respective dominant language: more precisely, `think&#39; refers to how the representations of intermediate layers, when un-embedded into the vocabulary space, exhibit higher probabilities for certain dominant languages during generation. We term such languages as internal $\textbf{latent languages}$. We examine the latent language of three typical categories of models for Japanese processing: Llama2, an English-centric model; Swallow, an English-centric model with continued pre-training in Japanese; and LLM-jp, a model pre-trained on balanced English and Japanese corpora. Our empirical findings reveal that, unlike Llama2 which relies exclusively on English as the internal latent language, Japanese-specific Swallow and LLM-jp employ both Japanese and English, exhibiting dual internal latent languages. For any given target language, the model preferentially activates the latent language most closely related to it. In addition, we explore how intermediate layers respond to questions involving cultural conflicts between latent internal and target output languages. We further explore how the language identity shifts across layers while keeping consistent semantic meaning reflected in the intermediate layer representations. This study deepens the understanding of non-English-centric large language models, highlighting the intricate dynamics of language representation within their intermediate layers. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.10811v1-abstract-full').style.display = 'none'; document.getElementById('2408.10811v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">work in progress</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.07401">arXiv:2408.07401</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.07401">pdf</a>, <a href="https://arxiv.org/format/2408.07401">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Databases">cs.DB</span> </div> </div> <p class="title is-5 mathjax"> DataVisT5: A Pre-trained Language Model for Jointly Understanding Text and Data Visualization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wan%2C+Z">Zhuoyue Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+Y">Yuanfeng Song</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+S">Shuaimin Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+C+J">Chen Jason Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Wong%2C+R+C">Raymond Chi-Wing Wong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.07401v2-abstract-short" style="display: inline;"> Data visualization (DV) is the fundamental and premise tool to improve the efficiency in conveying the insights behind the big data, which has been widely accepted in existing data-driven world. Task automation in DV, such as converting natural language queries to visualizations (i.e., text-to-vis), generating explanations from visualizations (i.e., vis-to-text), answering DV-related questions in&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.07401v2-abstract-full').style.display = 'inline'; document.getElementById('2408.07401v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.07401v2-abstract-full" style="display: none;"> Data visualization (DV) is the fundamental and premise tool to improve the efficiency in conveying the insights behind the big data, which has been widely accepted in existing data-driven world. Task automation in DV, such as converting natural language queries to visualizations (i.e., text-to-vis), generating explanations from visualizations (i.e., vis-to-text), answering DV-related questions in free form (i.e. FeVisQA), and explicating tabular data (i.e., table-to-text), is vital for advancing the field. Despite their potential, the application of pre-trained language models (PLMs) like T5 and BERT in DV has been limited by high costs and challenges in handling cross-modal information, leading to few studies on PLMs for DV. We introduce DataVisT5, a novel PLM tailored for DV that enhances the T5 architecture through a hybrid objective pre-training and multi-task fine-tuning strategy, integrating text and DV datasets to effectively interpret cross-modal semantics. Extensive evaluations on public datasets show that DataVisT5 consistently outperforms current state-of-the-art models on various DV-related tasks. We anticipate that DataVisT5 will not only inspire further research on vertical PLMs but also expand the range of applications for PLMs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.07401v2-abstract-full').style.display = 'none'; document.getElementById('2408.07401v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.03178">arXiv:2408.03178</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.03178">pdf</a>, <a href="https://arxiv.org/format/2408.03178">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> An Object is Worth 64x64 Pixels: Generating 3D Object via Image Diffusion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yan%2C+X">Xingguang Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+H">Han-Hung Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Wan%2C+Z">Ziyu Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Chang%2C+A+X">Angel X. Chang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.03178v1-abstract-short" style="display: inline;"> We introduce a new approach for generating realistic 3D models with UV maps through a representation termed &#34;Object Images.&#34; This approach encapsulates surface geometry, appearance, and patch structures within a 64x64 pixel image, effectively converting complex 3D shapes into a more manageable 2D format. By doing so, we address the challenges of both geometric and semantic irregularity inherent in&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.03178v1-abstract-full').style.display = 'inline'; document.getElementById('2408.03178v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.03178v1-abstract-full" style="display: none;"> We introduce a new approach for generating realistic 3D models with UV maps through a representation termed &#34;Object Images.&#34; This approach encapsulates surface geometry, appearance, and patch structures within a 64x64 pixel image, effectively converting complex 3D shapes into a more manageable 2D format. By doing so, we address the challenges of both geometric and semantic irregularity inherent in polygonal meshes. This method allows us to use image generation models, such as Diffusion Transformers, directly for 3D shape generation. Evaluated on the ABO dataset, our generated shapes with patch structures achieve point cloud FID comparable to recent 3D generative models, while naturally supporting PBR material generation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.03178v1-abstract-full').style.display = 'none'; document.getElementById('2408.03178v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project Page: https://omages.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.02688">arXiv:2408.02688</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.02688">pdf</a>, <a href="https://arxiv.org/format/2408.02688">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Dynamical Systems">math.DS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Atmospheric and Oceanic Physics">physics.ao-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Fluid Dynamics">physics.flu-dyn</span> </div> </div> <p class="title is-5 mathjax"> A probabilistic framework for learning non-intrusive corrections to long-time climate simulations from short-time training data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Sorensen%2C+B+B">Benedikt Barthel Sorensen</a>, <a href="/search/cs?searchtype=author&amp;query=Zepeda-N%C3%BA%C3%B1ez%2C+L">Leonardo Zepeda-N煤帽ez</a>, <a href="/search/cs?searchtype=author&amp;query=Lopez-Gomez%2C+I">Ignacio Lopez-Gomez</a>, <a href="/search/cs?searchtype=author&amp;query=Wan%2C+Z+Y">Zhong Yi Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Carver%2C+R">Rob Carver</a>, <a href="/search/cs?searchtype=author&amp;query=Sha%2C+F">Fei Sha</a>, <a href="/search/cs?searchtype=author&amp;query=Sapsis%2C+T">Themistoklis Sapsis</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.02688v2-abstract-short" style="display: inline;"> Chaotic systems, such as turbulent flows, are ubiquitous in science and engineering. However, their study remains a challenge due to the large range scales, and the strong interaction with other, often not fully understood, physics. As a consequence, the spatiotemporal resolution required for accurate simulation of these systems is typically computationally infeasible, particularly for application&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.02688v2-abstract-full').style.display = 'inline'; document.getElementById('2408.02688v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.02688v2-abstract-full" style="display: none;"> Chaotic systems, such as turbulent flows, are ubiquitous in science and engineering. However, their study remains a challenge due to the large range scales, and the strong interaction with other, often not fully understood, physics. As a consequence, the spatiotemporal resolution required for accurate simulation of these systems is typically computationally infeasible, particularly for applications of long-term risk assessment, such as the quantification of extreme weather risk due to climate change. While data-driven modeling offers some promise of alleviating these obstacles, the scarcity of high-quality simulations results in limited available data to train such models, which is often compounded by the lack of stability for long-horizon simulations. As such, the computational, algorithmic, and data restrictions generally imply that the probability of rare extreme events is not accurately captured. In this work we present a general strategy for training neural network models to non-intrusively correct under-resolved long-time simulations of chaotic systems. The approach is based on training a post-processing correction operator on under-resolved simulations nudged towards a high-fidelity reference. This enables us to learn the dynamics of the underlying system directly, which allows us to use very little training data, even when the statistics thereof are far from converged. Additionally, through the use of probabilistic network architectures we are able to leverage the uncertainty due to the limited training data to further improve extrapolation capabilities. We apply our framework to severely under-resolved simulations of quasi-geostrophic flow and demonstrate its ability to accurately predict the anisotropic statistics over time horizons more than 30 times longer than the data seen in training. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.02688v2-abstract-full').style.display = 'none'; document.getElementById('2408.02688v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.00227">arXiv:2408.00227</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.00227">pdf</a>, <a href="https://arxiv.org/ps/2408.00227">ps</a>, <a href="https://arxiv.org/format/2408.00227">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Data Structures and Algorithms">cs.DS</span> </div> </div> <p class="title is-5 mathjax"> Finding a Shortest $M$-link Path in a Monge Directed Acyclic Graph </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wan%2C+J+Z">Joy Z. Wan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.00227v1-abstract-short" style="display: inline;"> A Monge directed acyclic graph (DAG) $G$ on the nodes $1,2,\cdots,N$ has edges $\left( i,j\right) $ for $1\leq i&lt;j\leq N$ carrying submodular edge-lengths. Finding a shortest $M$-link path from $1$ to $N$ in $G$ for any given $1&lt;M&lt;N-1$ has many applications. In this paper, we give a contract-and-conquer algorithm for this problem which runs in&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.00227v1-abstract-full').style.display = 'inline'; document.getElementById('2408.00227v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.00227v1-abstract-full" style="display: none;"> A Monge directed acyclic graph (DAG) $G$ on the nodes $1,2,\cdots,N$ has edges $\left( i,j\right) $ for $1\leq i&lt;j\leq N$ carrying submodular edge-lengths. Finding a shortest $M$-link path from $1$ to $N$ in $G$ for any given $1&lt;M&lt;N-1$ has many applications. In this paper, we give a contract-and-conquer algorithm for this problem which runs in $O\left( \sqrt{NM\left( N-M\right) \log\left( N-M\right) }\right) $ time and $O\left( N\right) $ space. It is the first $o\left( NM\right) $-time algorithm with linear space complexity, and its time complexity decreases with $M$ when $M\geq N/2$. In contrast, all previous strongly polynomial algorithms have running time growing with $M$. For both $O\left( poly\left( \log N\right) \right) $ and $N-O\left( poly\left( \log N\right) \right) $ regimes of $M$, our algorithm has running time $O\left( N\cdot poly\left( \log N\right) \right) $, which partially answers an open question rased in \cite{AST94} affirmatively. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.00227v1-abstract-full').style.display = 'none'; document.getElementById('2408.00227v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.21004">arXiv:2407.21004</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.21004">pdf</a>, <a href="https://arxiv.org/format/2407.21004">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Evolver: Chain-of-Evolution Prompting to Boost Large Multimodal Models for Hateful Meme Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Huang%2C+J">Jinfa Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Pan%2C+J">Jinsheng Pan</a>, <a href="/search/cs?searchtype=author&amp;query=Wan%2C+Z">Zhongwei Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Lyu%2C+H">Hanjia Lyu</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+J">Jiebo Luo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.21004v1-abstract-short" style="display: inline;"> Recent advances show that two-stream approaches have achieved outstanding performance in hateful meme detection. However, hateful memes constantly evolve as new memes emerge by fusing progressive cultural ideas, making existing methods obsolete or ineffective. In this work, we explore the potential of Large Multimodal Models (LMMs) for hateful meme detection. To this end, we propose Evolver, which&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.21004v1-abstract-full').style.display = 'inline'; document.getElementById('2407.21004v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.21004v1-abstract-full" style="display: none;"> Recent advances show that two-stream approaches have achieved outstanding performance in hateful meme detection. However, hateful memes constantly evolve as new memes emerge by fusing progressive cultural ideas, making existing methods obsolete or ineffective. In this work, we explore the potential of Large Multimodal Models (LMMs) for hateful meme detection. To this end, we propose Evolver, which incorporates LMMs via Chain-of-Evolution (CoE) Prompting, by integrating the evolution attribute and in-context information of memes. Specifically, Evolver simulates the evolving and expressing process of memes and reasons through LMMs in a step-by-step manner. First, an evolutionary pair mining module retrieves the top-k most similar memes in the external curated meme set with the input meme. Second, an evolutionary information extractor is designed to summarize the semantic regularities between the paired memes for prompting. Finally, a contextual relevance amplifier enhances the in-context hatefulness information to boost the search for evolutionary processes. Extensive experiments on public FHM, MAMI, and HarM datasets show that CoE prompting can be incorporated into existing LMMs to improve their performance. More encouragingly, it can serve as an interpretive tool to promote the understanding of the evolution of social memes. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.21004v1-abstract-full').style.display = 'none'; document.getElementById('2407.21004v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.13623">arXiv:2407.13623</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.13623">pdf</a>, <a href="https://arxiv.org/format/2407.13623">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Scaling Laws with Vocabulary: Larger Models Deserve Larger Vocabularies </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Tao%2C+C">Chaofan Tao</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Q">Qian Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Dou%2C+L">Longxu Dou</a>, <a href="/search/cs?searchtype=author&amp;query=Muennighoff%2C+N">Niklas Muennighoff</a>, <a href="/search/cs?searchtype=author&amp;query=Wan%2C+Z">Zhongwei Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+P">Ping Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+M">Min Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Wong%2C+N">Ngai Wong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.13623v3-abstract-short" style="display: inline;"> Research on scaling large language models (LLMs) has primarily focused on model parameters and training data size, overlooking the role of vocabulary size. We investigate how vocabulary size impacts LLM scaling laws by training models ranging from 33M to 3B parameters on up to 500B characters with various vocabulary configurations. We propose three complementary approaches for predicting the compu&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.13623v3-abstract-full').style.display = 'inline'; document.getElementById('2407.13623v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.13623v3-abstract-full" style="display: none;"> Research on scaling large language models (LLMs) has primarily focused on model parameters and training data size, overlooking the role of vocabulary size. We investigate how vocabulary size impacts LLM scaling laws by training models ranging from 33M to 3B parameters on up to 500B characters with various vocabulary configurations. We propose three complementary approaches for predicting the compute-optimal vocabulary size: IsoFLOPs analysis, derivative estimation, and parametric fit of the loss function. Our approaches converge on the conclusion that the optimal vocabulary size depends on the compute budget, with larger models requiring larger vocabularies. Most LLMs, however, use insufficient vocabulary sizes. For example, we predict that the optimal vocabulary size of Llama2-70B should have been at least 216K, 7 times larger than its vocabulary of 32K. We validate our predictions empirically by training models with 3B parameters across different FLOPs budgets. Adopting our predicted optimal vocabulary size consistently improves downstream performance over commonly used vocabulary sizes. By increasing the vocabulary size from the conventional 32K to 43K, we improve performance on ARC-Challenge from 29.1 to 32.0 with the same 2.3e21 FLOPs. Our work highlights the importance of jointly considering tokenization and model scaling for efficient pre-training. The code and demo are available at https://github.com/sail-sg/scaling-with-vocab and https://hf.co/spaces/sail/scaling-with-vocab-demo. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.13623v3-abstract-full').style.display = 'none'; document.getElementById('2407.13623v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.04998">arXiv:2407.04998</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.04998">pdf</a>, <a href="https://arxiv.org/format/2407.04998">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> The Solution for the 5th GCAIAC Zero-shot Referring Expression Comprehension Challenge </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Huang%2C+L">Longfei Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+F">Feng Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Guan%2C+Z">Zhihao Guan</a>, <a href="/search/cs?searchtype=author&amp;query=Wan%2C+Z">Zhonghua Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Y">Yang Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.04998v1-abstract-short" style="display: inline;"> This report presents a solution for the zero-shot referring expression comprehension task. Visual-language multimodal base models (such as CLIP, SAM) have gained significant attention in recent years as a cornerstone of mainstream research. One of the key applications of multimodal base models lies in their ability to generalize to zero-shot downstream tasks. Unlike traditional referring expressio&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.04998v1-abstract-full').style.display = 'inline'; document.getElementById('2407.04998v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.04998v1-abstract-full" style="display: none;"> This report presents a solution for the zero-shot referring expression comprehension task. Visual-language multimodal base models (such as CLIP, SAM) have gained significant attention in recent years as a cornerstone of mainstream research. One of the key applications of multimodal base models lies in their ability to generalize to zero-shot downstream tasks. Unlike traditional referring expression comprehension, zero-shot referring expression comprehension aims to apply pre-trained visual-language models directly to the task without specific training. Recent studies have enhanced the zero-shot performance of multimodal base models in referring expression comprehension tasks by introducing visual prompts. To address the zero-shot referring expression comprehension challenge, we introduced a combination of visual prompts and considered the influence of textual prompts, employing joint prediction tailored to the data characteristics. Ultimately, our approach achieved accuracy rates of 84.825 on the A leaderboard and 71.460 on the B leaderboard, securing the first position. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.04998v1-abstract-full').style.display = 'none'; document.getElementById('2407.04998v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.04996">arXiv:2407.04996</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.04996">pdf</a>, <a href="https://arxiv.org/format/2407.04996">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> The Solution for the sequential task continual learning track of the 2nd Greater Bay Area International Algorithm Competition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Pan%2C+S">Sishun Pan</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+X">Xixian Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+T">Tingmin Li</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+L">Longfei Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Feng%2C+M">Mingxu Feng</a>, <a href="/search/cs?searchtype=author&amp;query=Wan%2C+Z">Zhonghua Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Y">Yang Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.04996v1-abstract-short" style="display: inline;"> This paper presents a data-free, parameter-isolation-based continual learning algorithm we developed for the sequential task continual learning track of the 2nd Greater Bay Area International Algorithm Competition. The method learns an independent parameter subspace for each task within the network&#39;s convolutional and linear layers and freezes the batch normalization layers after the first task. S&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.04996v1-abstract-full').style.display = 'inline'; document.getElementById('2407.04996v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.04996v1-abstract-full" style="display: none;"> This paper presents a data-free, parameter-isolation-based continual learning algorithm we developed for the sequential task continual learning track of the 2nd Greater Bay Area International Algorithm Competition. The method learns an independent parameter subspace for each task within the network&#39;s convolutional and linear layers and freezes the batch normalization layers after the first task. Specifically, for domain incremental setting where all domains share a classification head, we freeze the shared classification head after first task is completed, effectively solving the issue of catastrophic forgetting. Additionally, facing the challenge of domain incremental settings without providing a task identity, we designed an inference task identity strategy, selecting an appropriate mask matrix for each sample. Furthermore, we introduced a gradient supplementation strategy to enhance the importance of unselected parameters for the current task, facilitating learning for new tasks. We also implemented an adaptive importance scoring strategy that dynamically adjusts the amount of parameters to optimize single-task performance while reducing parameter usage. Moreover, considering the limitations of storage space and inference time, we designed a mask matrix compression strategy to save storage space and improve the speed of encryption and decryption of the mask matrix. Our approach does not require expanding the core network or using external auxiliary networks or data, and performs well under both task incremental and domain incremental settings. This solution ultimately won a second-place prize in the competition. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.04996v1-abstract-full').style.display = 'none'; document.getElementById('2407.04996v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.04994">arXiv:2407.04994</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.04994">pdf</a>, <a href="https://arxiv.org/format/2407.04994">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> The Solution for Language-Enhanced Image New Category Discovery </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xu%2C+H">Haonan Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Chao%2C+D">Dian Chao</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+X">Xiangyu Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Wan%2C+Z">Zhonghua Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Y">Yang Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.04994v1-abstract-short" style="display: inline;"> Treating texts as images, combining prompts with textual labels for prompt tuning, and leveraging the alignment properties of CLIP have been successfully applied in zero-shot multi-label image recognition. Nonetheless, relying solely on textual labels to store visual information is insufficient for representing the diversity of visual objects. In this paper, we propose reversing the training proce&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.04994v1-abstract-full').style.display = 'inline'; document.getElementById('2407.04994v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.04994v1-abstract-full" style="display: none;"> Treating texts as images, combining prompts with textual labels for prompt tuning, and leveraging the alignment properties of CLIP have been successfully applied in zero-shot multi-label image recognition. Nonetheless, relying solely on textual labels to store visual information is insufficient for representing the diversity of visual objects. In this paper, we propose reversing the training process of CLIP and introducing the concept of Pseudo Visual Prompts. These prompts are initialized for each object category and pre-trained on large-scale, low-cost sentence data generated by large language models. This process mines the aligned visual information in CLIP and stores it in class-specific visual prompts. We then employ contrastive learning to transfer the stored visual information to the textual labels, enhancing their visual representation capacity. Additionally, we introduce a dual-adapter module that simultaneously leverages knowledge from the original CLIP and new learning knowledge derived from downstream datasets. Benefiting from the pseudo visual prompts, our method surpasses the state-of-the-art not only on clean annotated text data but also on pseudo text data generated by large language models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.04994v1-abstract-full').style.display = 'none'; document.getElementById('2407.04994v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.04991">arXiv:2407.04991</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.04991">pdf</a>, <a href="https://arxiv.org/format/2407.04991">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> The Solution for the AIGC Inference Performance Optimization Competition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Pan%2C+S">Sishun Pan</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+H">Haonan Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Wan%2C+Z">Zhonghua Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Y">Yang Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.04991v1-abstract-short" style="display: inline;"> In recent years, the rapid advancement of large-scale pre-trained language models based on transformer architectures has revolutionized natural language processing tasks. Among these, ChatGPT has gained widespread popularity, demonstrating human-level conversational abilities and attracting over 100 million monthly users by late 2022. Concurrently, Baidu&#39;s commercial deployment of the Ernie Wenxin&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.04991v1-abstract-full').style.display = 'inline'; document.getElementById('2407.04991v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.04991v1-abstract-full" style="display: none;"> In recent years, the rapid advancement of large-scale pre-trained language models based on transformer architectures has revolutionized natural language processing tasks. Among these, ChatGPT has gained widespread popularity, demonstrating human-level conversational abilities and attracting over 100 million monthly users by late 2022. Concurrently, Baidu&#39;s commercial deployment of the Ernie Wenxin model has significantly enhanced marketing effectiveness through AI-driven technologies. This paper focuses on optimizing high-performance inference for Ernie models, emphasizing GPU acceleration and leveraging the Paddle inference framework. We employ techniques such as Faster Transformer for efficient model processing, embedding layer pruning to reduce computational overhead, and FP16 half-precision inference for enhanced computational efficiency. Additionally, our approach integrates efficient data handling strategies using multi-process parallel processing to minimize latency. Experimental results demonstrate that our optimized solution achieves up to an 8.96x improvement in inference speed compared to standard methods, while maintaining competitive performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.04991v1-abstract-full').style.display = 'none'; document.getElementById('2407.04991v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.03963">arXiv:2407.03963</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.03963">pdf</a>, <a href="https://arxiv.org/format/2407.03963">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> LLM-jp: A Cross-organizational Project for the Research and Development of Fully Open Japanese LLMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=LLM-jp"> LLM-jp</a>, <a href="/search/cs?searchtype=author&amp;query=%3A"> :</a>, <a href="/search/cs?searchtype=author&amp;query=Aizawa%2C+A">Akiko Aizawa</a>, <a href="/search/cs?searchtype=author&amp;query=Aramaki%2C+E">Eiji Aramaki</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+B">Bowen Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+F">Fei Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Deguchi%2C+H">Hiroyuki Deguchi</a>, <a href="/search/cs?searchtype=author&amp;query=Enomoto%2C+R">Rintaro Enomoto</a>, <a href="/search/cs?searchtype=author&amp;query=Fujii%2C+K">Kazuki Fujii</a>, <a href="/search/cs?searchtype=author&amp;query=Fukumoto%2C+K">Kensuke Fukumoto</a>, <a href="/search/cs?searchtype=author&amp;query=Fukushima%2C+T">Takuya Fukushima</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+N">Namgi Han</a>, <a href="/search/cs?searchtype=author&amp;query=Harada%2C+Y">Yuto Harada</a>, <a href="/search/cs?searchtype=author&amp;query=Hashimoto%2C+C">Chikara Hashimoto</a>, <a href="/search/cs?searchtype=author&amp;query=Hiraoka%2C+T">Tatsuya Hiraoka</a>, <a href="/search/cs?searchtype=author&amp;query=Hisada%2C+S">Shohei Hisada</a>, <a href="/search/cs?searchtype=author&amp;query=Hosokawa%2C+S">Sosuke Hosokawa</a>, <a href="/search/cs?searchtype=author&amp;query=Jie%2C+L">Lu Jie</a>, <a href="/search/cs?searchtype=author&amp;query=Kamata%2C+K">Keisuke Kamata</a>, <a href="/search/cs?searchtype=author&amp;query=Kanazawa%2C+T">Teruhito Kanazawa</a>, <a href="/search/cs?searchtype=author&amp;query=Kanezashi%2C+H">Hiroki Kanezashi</a>, <a href="/search/cs?searchtype=author&amp;query=Kataoka%2C+H">Hiroshi Kataoka</a>, <a href="/search/cs?searchtype=author&amp;query=Katsumata%2C+S">Satoru Katsumata</a>, <a href="/search/cs?searchtype=author&amp;query=Kawahara%2C+D">Daisuke Kawahara</a>, <a href="/search/cs?searchtype=author&amp;query=Kawano%2C+S">Seiya Kawano</a> , et al. (57 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.03963v1-abstract-short" style="display: inline;"> This paper introduces LLM-jp, a cross-organizational project for the research and development of Japanese large language models (LLMs). LLM-jp aims to develop open-source and strong Japanese LLMs, and as of this writing, more than 1,500 participants from academia and industry are working together for this purpose. This paper presents the background of the establishment of LLM-jp, summaries of its&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.03963v1-abstract-full').style.display = 'inline'; document.getElementById('2407.03963v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.03963v1-abstract-full" style="display: none;"> This paper introduces LLM-jp, a cross-organizational project for the research and development of Japanese large language models (LLMs). LLM-jp aims to develop open-source and strong Japanese LLMs, and as of this writing, more than 1,500 participants from academia and industry are working together for this purpose. This paper presents the background of the establishment of LLM-jp, summaries of its activities, and technical reports on the LLMs developed by LLM-jp. For the latest activities, visit https://llm-jp.nii.ac.jp/en/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.03963v1-abstract-full').style.display = 'none'; document.getElementById('2407.03963v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.01081">arXiv:2407.01081</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.01081">pdf</a>, <a href="https://arxiv.org/format/2407.01081">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> CVLUE: A New Benchmark Dataset for Chinese Vision-Language Understanding Evaluation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yuxuan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yijun Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+F">Fei Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+C">Chen Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+K">Kexin Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wan%2C+Z">Zhiguo Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Che%2C+W">Wanxiang Che</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.01081v1-abstract-short" style="display: inline;"> Despite the rapid development of Chinese vision-language models (VLMs), most existing Chinese vision-language (VL) datasets are constructed on Western-centric images from existing English VL datasets. The cultural bias in the images makes these datasets unsuitable for evaluating VLMs in Chinese culture. To remedy this issue, we present a new Chinese Vision- Language Understanding Evaluation (CVLUE&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.01081v1-abstract-full').style.display = 'inline'; document.getElementById('2407.01081v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.01081v1-abstract-full" style="display: none;"> Despite the rapid development of Chinese vision-language models (VLMs), most existing Chinese vision-language (VL) datasets are constructed on Western-centric images from existing English VL datasets. The cultural bias in the images makes these datasets unsuitable for evaluating VLMs in Chinese culture. To remedy this issue, we present a new Chinese Vision- Language Understanding Evaluation (CVLUE) benchmark dataset, where the selection of object categories and images is entirely driven by Chinese native speakers, ensuring that the source images are representative of Chinese culture. The benchmark contains four distinct VL tasks ranging from image-text retrieval to visual question answering, visual grounding and visual dialogue. We present a detailed statistical analysis of CVLUE and provide a baseline performance analysis with several open-source multilingual VLMs on CVLUE and its English counterparts to reveal their performance gap between English and Chinese. Our in-depth category-level analysis reveals a lack of Chinese cultural knowledge in existing VLMs. We also find that fine-tuning on Chinese culture-related VL datasets effectively enhances VLMs&#39; understanding of Chinese culture. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.01081v1-abstract-full').style.display = 'none'; document.getElementById('2407.01081v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.18139">arXiv:2406.18139</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.18139">pdf</a>, <a href="https://arxiv.org/format/2406.18139">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> LOOK-M: Look-Once Optimization in KV Cache for Efficient Multimodal Long-Context Inference </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wan%2C+Z">Zhongwei Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Z">Ziang Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+C">Che Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+J">Jinfa Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+Z">Zhihong Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Jin%2C+P">Peng Jin</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+L">Longyue Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+L">Li Yuan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.18139v1-abstract-short" style="display: inline;"> Long-context Multimodal Large Language Models (MLLMs) demand substantial computational resources for inference as the growth of their multimodal Key-Value (KV) cache, in response to increasing input lengths, challenges memory and time efficiency. Unlike single-modality LLMs that manage only textual contexts, the KV cache of long-context MLLMs includes representations from multiple images with temp&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.18139v1-abstract-full').style.display = 'inline'; document.getElementById('2406.18139v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.18139v1-abstract-full" style="display: none;"> Long-context Multimodal Large Language Models (MLLMs) demand substantial computational resources for inference as the growth of their multimodal Key-Value (KV) cache, in response to increasing input lengths, challenges memory and time efficiency. Unlike single-modality LLMs that manage only textual contexts, the KV cache of long-context MLLMs includes representations from multiple images with temporal and spatial relationships and related textual contexts. The predominance of image tokens means traditional optimizations for LLMs&#39; KV caches are unsuitable for multimodal long-context settings, and no prior works have addressed this challenge. In this work, we introduce LOOK-M, a pioneering, fine-tuning-free approach that efficiently reduces the multimodal KV cache size while maintaining performance comparable to a full cache. We observe that during prompt prefill, the model prioritizes more textual attention over image features, and based on the multimodal interaction observation, a new proposed text-prior method is explored to compress the KV cache. Furthermore, to mitigate the degradation of image contextual information, we propose several compensatory strategies using KV pairs merging. LOOK-M demonstrates that with a significant reduction in KV Cache memory usage, such as reducing it by 80% in some cases, it not only achieves up to 1.5x faster decoding but also maintains or even enhances performance across a variety of long context multimodal tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.18139v1-abstract-full').style.display = 'none'; document.getElementById('2406.18139v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.14278">arXiv:2406.14278</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.14278">pdf</a>, <a href="https://arxiv.org/ps/2406.14278">ps</a>, <a href="https://arxiv.org/format/2406.14278">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Data Structures and Algorithms">cs.DS</span> </div> </div> <p class="title is-5 mathjax"> Efficient Deterministic Algorithms for Maximizing Symmetric Submodular Functions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wan%2C+Z">Zongqi Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+J">Jialin Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+X">Xiaoming Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zhijie Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.14278v1-abstract-short" style="display: inline;"> Symmetric submodular maximization is an important class of combinatorial optimization problems, including MAX-CUT on graphs and hyper-graphs. The state-of-the-art algorithm for the problem over general constraints has an approximation ratio of $0.432$. The algorithm applies the canonical continuous greedy technique that involves a sampling process. It, therefore, suffers from high query complexity&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.14278v1-abstract-full').style.display = 'inline'; document.getElementById('2406.14278v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.14278v1-abstract-full" style="display: none;"> Symmetric submodular maximization is an important class of combinatorial optimization problems, including MAX-CUT on graphs and hyper-graphs. The state-of-the-art algorithm for the problem over general constraints has an approximation ratio of $0.432$. The algorithm applies the canonical continuous greedy technique that involves a sampling process. It, therefore, suffers from high query complexity and is inherently randomized. In this paper, we present several efficient deterministic algorithms for maximizing a symmetric submodular function under various constraints. Specifically, for the cardinality constraint, we design a deterministic algorithm that attains a $0.432$ ratio and uses $O(kn)$ queries. Previously, the best deterministic algorithm attains a $0.385-蔚$ ratio and uses $O\left(kn (\frac{10}{9蔚})^{\frac{20}{9蔚}-1}\right)$ queries. For the matroid constraint, we design a deterministic algorithm that attains a $1/3-蔚$ ratio and uses $O(kn\log 蔚^{-1})$ queries. Previously, the best deterministic algorithm can also attain $1/3-蔚$ ratio but it uses much larger $O(蔚^{-1}n^4)$ queries. For the packing constraints with a large width, we design a deterministic algorithm that attains a $0.432-蔚$ ratio and uses $O(n^2)$ queries. To the best of our knowledge, there is no deterministic algorithm for the constraint previously. The last algorithm can be adapted to attain a $0.432$ ratio for single knapsack constraint using $O(n^4)$ queries. Previously, the best deterministic algorithm attains a $0.316-蔚$ ratio and uses $\widetilde{O}(n^3)$ queries. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.14278v1-abstract-full').style.display = 'none'; document.getElementById('2406.14278v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.13060">arXiv:2406.13060</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.13060">pdf</a>, <a href="https://arxiv.org/format/2406.13060">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Applications">stat.AP</span> </div> </div> <p class="title is-5 mathjax"> Scale-Translation Equivariant Network for Oceanic Internal Solitary Wave Localization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wan%2C+Z">Zhang Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+S">Shuo Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+X">Xudong Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.13060v1-abstract-short" style="display: inline;"> Internal solitary waves (ISWs) are gravity waves that are often observed in the interior ocean rather than the surface. They hold significant importance due to their capacity to carry substantial energy, thus influence pollutant transport, oil platform operations, submarine navigation, etc. Researchers have studied ISWs through optical images, synthetic aperture radar (SAR) images, and altimeter d&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.13060v1-abstract-full').style.display = 'inline'; document.getElementById('2406.13060v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.13060v1-abstract-full" style="display: none;"> Internal solitary waves (ISWs) are gravity waves that are often observed in the interior ocean rather than the surface. They hold significant importance due to their capacity to carry substantial energy, thus influence pollutant transport, oil platform operations, submarine navigation, etc. Researchers have studied ISWs through optical images, synthetic aperture radar (SAR) images, and altimeter data from remote sensing instruments. However, cloud cover in optical remote sensing images variably obscures ground information, leading to blurred or missing surface observations. As such, this paper aims at altimeter-based machine learning solutions to automatically locate ISWs. The challenges, however, lie in the following two aspects: 1) the altimeter data has low resolution, which requires a strong machine learner; 2) labeling data is extremely labor-intensive, leading to very limited data for training. In recent years, the grand progress of deep learning demonstrates strong learning capacity given abundant data. Besides, more recent studies on efficient learning and self-supervised learning laid solid foundations to tackle the aforementioned challenges. In this paper, we propose to inject prior knowledge to achieve a strong and efficient learner. Specifically, intrinsic patterns in altimetry data are efficiently captured using a scale-translation equivariant convolutional neural network (ST-ECNN). By considering inherent symmetries in neural network design, ST-ECNN achieves higher efficiency and better performance than baseline models. Furthermore, we also introduce prior knowledge from massive unsupervised data to enhance our solution using the SimCLR framework for pre-training. Our final solution achieves an overall better performance than baselines on our handcrafted altimetry dataset. Data and codes are available at https://github.com/ZhangWan-byte/Internal_Solitary_Wave_Localization . <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.13060v1-abstract-full').style.display = 'none'; document.getElementById('2406.13060v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">29 pages, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.13035">arXiv:2406.13035</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.13035">pdf</a>, <a href="https://arxiv.org/format/2406.13035">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> D2O: Dynamic Discriminative Operations for Efficient Generative Inference of Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wan%2C+Z">Zhongwei Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+X">Xinjian Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yu Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Xin%2C+Y">Yi Xin</a>, <a href="/search/cs?searchtype=author&amp;query=Tao%2C+C">Chaofan Tao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+Z">Zhihong Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xin Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+S">Siqi Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Xiong%2C+J">Jing Xiong</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+M">Mi Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.13035v2-abstract-short" style="display: inline;"> Efficient inference in Large Language Models (LLMs) is impeded by the growing memory demands of key-value (KV) caching, especially for longer sequences. Traditional KV cache eviction strategies, which prioritize less critical KV-pairs based on attention scores, often degrade generation quality, leading to issues such as context loss or hallucinations. To address this, we introduce Dynamic Discrimi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.13035v2-abstract-full').style.display = 'inline'; document.getElementById('2406.13035v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.13035v2-abstract-full" style="display: none;"> Efficient inference in Large Language Models (LLMs) is impeded by the growing memory demands of key-value (KV) caching, especially for longer sequences. Traditional KV cache eviction strategies, which prioritize less critical KV-pairs based on attention scores, often degrade generation quality, leading to issues such as context loss or hallucinations. To address this, we introduce Dynamic Discriminative Operations (D2O), a novel method that utilizes two-level discriminative strategies to optimize KV cache size without fine-tuning, while preserving essential context. Initially, by observing varying densities of attention weights between shallow and deep layers, we use this insight to determine which layers should avoid excessive eviction to minimize information loss. Subsequently, for the eviction strategy in each layer, D2O innovatively incorporates a compensation mechanism that maintains a similarity threshold to re-discriminate the importance of previously discarded tokens, determining whether they should be recalled and merged with similar tokens. Our approach not only achieves significant memory savings and enhances inference throughput by more than 3 times but also maintains high-quality long-text generation. Extensive experiments across various benchmarks and LLM architectures have demonstrated that D2O significantly enhances performance with a constrained KV cache budget. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.13035v2-abstract-full').style.display = 'none'; document.getElementById('2406.13035v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.07146">arXiv:2406.07146</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.07146">pdf</a>, <a href="https://arxiv.org/format/2406.07146">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Benchmarking and Boosting Radiology Report Generation for 3D High-Resolution Medical Images </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+C">Che Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Wan%2C+Z">Zhongwei Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yuqi Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+H">Hui Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+H">Haozhe Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+K">Kangyu Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+M">Mi Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Arcucci%2C+R">Rossella Arcucci</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.07146v2-abstract-short" style="display: inline;"> Automatic radiology report generation can significantly benefit the labor-intensive process of report writing by radiologists, especially for 3D radiographs like CT scans, which are crucial for broad clinical diagnostics yet underexplored compared to 2D radiographs. Existing methods often handle 3D volumes either slice-wise or with aggressive downsampling due to current GPU memory limitations, whi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.07146v2-abstract-full').style.display = 'inline'; document.getElementById('2406.07146v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.07146v2-abstract-full" style="display: none;"> Automatic radiology report generation can significantly benefit the labor-intensive process of report writing by radiologists, especially for 3D radiographs like CT scans, which are crucial for broad clinical diagnostics yet underexplored compared to 2D radiographs. Existing methods often handle 3D volumes either slice-wise or with aggressive downsampling due to current GPU memory limitations, which results in a loss of the inherent 3D nature and critical details. To overcome these issues, we introduce a novel framework that efficiently and effectively generates radiology reports for high-resolution (HR) 3D volumes, based on large language models (LLMs). Specifically, our framework utilizes low-resolution (LR) visual tokens as queries to mine information from HR tokens, preserving detailed HR information while reducing computational costs by only processing HR informed LR visual queries. Further benefiting the field, we curate and release BIMCV-RG, a new dataset with 5,328 HR 3D volumes and paired reports, establishing the first benchmarks for report generation from 3D HR medical images. Our method consistently surpasses existing methods on this benchmark across three different settings: normal-resolution, high-resolution inputs, and zero-shot domain transfer, all at an acceptable computational cost, trainable on a single A100-80G. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.07146v2-abstract-full').style.display = 'none'; document.getElementById('2406.07146v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 11 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.04835">arXiv:2406.04835</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.04835">pdf</a>, <a href="https://arxiv.org/format/2406.04835">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> SLR: Learning Quadruped Locomotion without Privileged Information </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+S">Shiyi Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Wan%2C+Z">Zeyu Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+S">Shiyang Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+C">Chun Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+W">Weiyi Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Q">Qiang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+D">Debing Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Farrukh%2C+F+U+D">Fasih Ud Din Farrukh</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.04835v2-abstract-short" style="display: inline;"> The recent mainstream reinforcement learning control for quadruped robots often relies on privileged information, demanding meticulous selection and precise estimation, thereby imposing constraints on the development process. This work proposes a Self-learning Latent Representation (SLR) method, which achieves high-performance control policy learning without the need for privileged information. To&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.04835v2-abstract-full').style.display = 'inline'; document.getElementById('2406.04835v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.04835v2-abstract-full" style="display: none;"> The recent mainstream reinforcement learning control for quadruped robots often relies on privileged information, demanding meticulous selection and precise estimation, thereby imposing constraints on the development process. This work proposes a Self-learning Latent Representation (SLR) method, which achieves high-performance control policy learning without the need for privileged information. To enhance the credibility of the proposed method&#39;s evaluation, SLR was directly compared with state-of-the-art algorithms using their open-source code repositories and original configuration parameters. Remarkably, SLR surpasses the performance of previous methods using only limited proprioceptive data, demonstrating significant potential for future applications. Ultimately, the trained policy and encoder empower the quadruped robot to traverse various challenging terrains. Videos of our results can be found on our website: https://11chens.github.io/SLR/ <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.04835v2-abstract-full').style.display = 'none'; document.getElementById('2406.04835v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Wan%2C+Z&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Wan%2C+Z&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Wan%2C+Z&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Wan%2C+Z&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Wan%2C+Z&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Wan%2C+Z&amp;start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Wan%2C+Z&amp;start=250" class="pagination-link " aria-label="Page 6" aria-current="page">6 </a> </li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

Pages: 1 2 3 4 5 6 7 8 9 10