Search | arXiv e-print repository
Showing 1–50 of 668 results for author: Guo, H

Searching in archive cs. Results are sorted by announcement date (newest first), 50 per page.
1. arXiv:2411.12888 [pdf, other]  cs.IT, eess.SP
   An Experimental Multi-Band Channel Characterization in the Upper Mid-Band
   Authors: Roberto Bomfin, Ahmad Bazzi, Hao Guo, Hyeongtaek Lee, Marco Mezzavilla, Sundeep Rangan, Junil Choi, Marwa Chafii
   Abstract: The following paper provides a multi-band channel measurement analysis on the frequency range (FR)3. This study focuses on the FR3 low frequencies 6.5 GHz and 8.75 GHz with a setup tailored to the context of integrated sensing and communication (ISAC), where the data are collected with and without the presence of a target. A method based on multiple signal classification (MUSIC) is used to refine the delays of the channel impulse response estimates. The results reveal that the channel at the lower frequency 6.5 GHz has additional distinguishable multipath components in the presence of the target, while the one associated with the higher frequency 8.75 GHz has more blockage. The set of results reported in this paper serves as a benchmark for future multi-band studies in the FR3 spectrum.
   Submitted 19 November, 2024; originally announced November 2024.
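As an illustration of the MUSIC-based delay refinement this abstract mentions, the sketch below runs a generic MUSIC pseudospectrum search over a delay grid on synthetic frequency-domain channel snapshots; the bandwidth, path delays, and snapshot model are assumptions for demonstration, not the paper's measurement setup.

```python
# Minimal sketch of MUSIC-style delay estimation from frequency-domain channel
# snapshots (generic technique; not the paper's processing chain).
import numpy as np

def music_delay_spectrum(H, freqs, n_paths, delay_grid):
    """H: (n_snapshots, n_subcarriers) complex channel estimates."""
    R = H.conj().T @ H / H.shape[0]               # covariance across snapshots
    _, eigvecs = np.linalg.eigh(R)                # eigenvalues in ascending order
    En = eigvecs[:, :-n_paths]                    # noise subspace
    spectrum = []
    for tau in delay_grid:
        a = np.exp(-2j * np.pi * freqs * tau)     # delay steering vector
        spectrum.append(1.0 / max(np.linalg.norm(En.conj().T @ a) ** 2, 1e-12))
    return np.array(spectrum)

# Toy usage: two paths at 30 ns and 55 ns over an assumed 100 MHz band.
rng = np.random.default_rng(0)
freqs = np.linspace(0, 100e6, 256)
true_delays = [30e-9, 55e-9]
snapshots = np.array([
    sum((rng.standard_normal() + 1j * rng.standard_normal())
        * np.exp(-2j * np.pi * freqs * d) for d in true_delays)
    + 0.05 * (rng.standard_normal(256) + 1j * rng.standard_normal(256))
    for _ in range(64)])
grid = np.linspace(0, 100e-9, 1001)
p = music_delay_spectrum(snapshots, freqs, n_paths=2, delay_grid=grid)
local_max = np.where((p[1:-1] > p[:-2]) & (p[1:-1] > p[2:]))[0] + 1
top2 = local_max[np.argsort(p[local_max])[-2:]]
print("estimated delays (ns):", np.sort(grid[top2]) * 1e9)
```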
2. arXiv:2411.12825 [pdf, other]  cs.MM
   TopoCode: Topologically Informed Error Detection and Correction in Communication Systems
   Authors: Hongzhi Guo
   Abstract: Traditional error detection and correction codes focus on bit-level fidelity, which is insufficient for emerging technologies like eXtended Reality (XR) and holographic communications requiring high-data-rate, low-latency systems. Bit-level metrics cannot comprehensively evaluate Quality-of-Service (QoS) in these scenarios. This letter proposes TopoCode which leverages Topological Data Analysis (TDA) and persistent homology to encode topological information for message-level error detection and correction. It introduces minimal redundancy while enabling effective data reconstruction, especially in low Signal-to-Noise Ratio (SNR) conditions. TopoCode offers a promising approach to meet the demands of next-generation communication systems prioritizing semantic accuracy and message-level integrity.
   Submitted 19 November, 2024; originally announced November 2024.
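To give a concrete feel for the persistent-homology machinery this abstract builds on, here is a minimal, self-contained computation of 0-dimensional persistence (connected-component lifetimes under a Vietoris-Rips filtration) using union-find; it is a generic TDA toy on a point cloud, not TopoCode's message encoder.

```python
# Minimal sketch of 0-dimensional persistent homology: component lifetimes of a
# point cloud as edges enter the filtration in order of length.
import numpy as np
from itertools import combinations

def h0_persistence(points):
    n = len(points)
    parent = list(range(n))

    def find(i):
        while parent[i] != i:
            parent[i] = parent[parent[i]]        # path compression
            i = parent[i]
        return i

    # Every component is born at filtration value 0; it dies when the edge
    # that merges it into another component appears.
    edges = sorted(
        (np.linalg.norm(points[i] - points[j]), i, j)
        for i, j in combinations(range(n), 2))
    deaths = []
    for dist, i, j in edges:
        ri, rj = find(i), find(j)
        if ri != rj:
            parent[ri] = rj
            deaths.append(dist)                  # one component dies at this scale
    return deaths                                # n-1 finite deaths; one class persists

# Toy usage: two well-separated clusters leave one long-lived component.
rng = np.random.default_rng(0)
cloud = np.vstack([rng.normal(0, 0.1, (20, 2)),
                   rng.normal(0, 0.1, (20, 2)) + [5, 0]])
lifetimes = h0_persistence(cloud)
print("longest finite H0 lifetime:", max(lifetimes))   # roughly the 5-unit gap
```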
3. arXiv:2411.11335 [pdf, other]  cs.CV
   Video-to-Task Learning via Motion-Guided Attention for Few-Shot Action Recognition
   Authors: Hanyu Guo, Wanchuan Yu, Suzhou Que, Kaiwen Du, Yan Yan, Hanzi Wang
   Abstract: In recent years, few-shot action recognition has achieved remarkable performance through spatio-temporal relation modeling. Although a wide range of spatial and temporal alignment modules have been proposed, they primarily address spatial or temporal misalignments at the video level, while the spatio-temporal relationships across different videos at the task level remain underexplored. Recent studies utilize class prototypes to learn task-specific features but overlook the spatio-temporal relationships across different videos at the task level, especially in the spatial dimension, where these relationships provide rich information. In this paper, we propose a novel Dual Motion-Guided Attention Learning method (called DMGAL) for few-shot action recognition, aiming to learn the spatio-temporal relationships from the video-specific to the task-specific level. To achieve this, we propose a carefully designed Motion-Guided Attention (MGA) method to identify and correlate motion-related region features from the video level to the task level. Specifically, the Self Motion-Guided Attention module (S-MGA) achieves spatio-temporal relation modeling at the video level by identifying and correlating motion-related region features between different frames within a video. The Cross Motion-Guided Attention module (C-MGA) identifies and correlates motion-related region features between frames of different videos within a specific task to achieve spatio-temporal relationships at the task level. This approach enables the model to construct class prototypes that fully incorporate spatio-temporal relationships from the video-specific level to the task-specific level. We validate the effectiveness of our DMGAL method by employing both fully fine-tuning and adapter-tuning paradigms. The models developed using these paradigms are termed DMGAL-FT and DMGAL-Adapter, respectively.
   Submitted 18 November, 2024; originally announced November 2024.
4. arXiv:2411.11110 [pdf, other]  eess.IV, cs.CV
   Retinal Vessel Segmentation via Neuron Programming
   Authors: Tingting Wu, Ruyi Min, Peixuan Song, Hengtao Guo, Tieyong Zeng, Feng-Lei Fan
   Abstract: The accurate segmentation of retinal blood vessels plays a crucial role in the early diagnosis and treatment of various ophthalmic diseases. Designing a network model for this task requires meticulous tuning and extensive experimentation to handle the tiny and intertwined morphology of retinal blood vessels. To tackle this challenge, Neural Architecture Search (NAS) methods are developed to fully explore the space of potential network architectures and go after the most powerful one. Inspired by neuronal diversity which is the biological foundation of all kinds of intelligent behaviors in our brain, this paper introduces a novel and foundational approach to neural network design, termed "neuron programming", to automatically search neuronal types into a network to enhance a network's representation ability at the neuronal level, which is complementary to architecture-level enhancement done by NAS. Additionally, to mitigate the time and computational intensity of neuron programming, we develop a hypernetwork that leverages the search-derived architectural information to predict optimal neuronal configurations. Comprehensive experiments validate that neuron programming can achieve competitive performance in retinal blood segmentation, demonstrating the strong potential of neuronal diversity in medical image analysis.
   Submitted 17 November, 2024; originally announced November 2024.

5. arXiv:2411.10187 [pdf, other]  cs.CV
   Try-On-Adapter: A Simple and Flexible Try-On Paradigm
   Authors: Hanzhong Guo, Jianfeng Zhang, Cheng Zou, Jun Li, Meng Wang, Ruxue Wen, Pingzhong Tang, Jingdong Chen, Ming Yang
   Abstract: Image-based virtual try-on, widely used in online shopping, aims to generate images of a naturally dressed person conditioned on certain garments, providing significant research and commercial potential. A key challenge of try-on is to generate realistic images of the model wearing the garments while preserving the details of the garments. Previous methods focus on masking certain parts of the original model's standing image, and then inpainting on masked areas to generate realistic images of the model wearing corresponding reference garments, which treat the try-on task as an inpainting task. However, such implements require the user to provide a complete, high-quality standing image, which is user-unfriendly in practical applications. In this paper, we propose Try-On-Adapter (TOA), an outpainting paradigm that differs from the existing inpainting paradigm. Our TOA can preserve the given face and garment, naturally imagine the rest parts of the image, and provide flexible control ability with various conditions, e.g., garment properties and human pose. In the experiments, TOA shows excellent performance on the virtual try-on task even given relatively low-quality face and garment images in qualitative comparisons. Additionally, TOA achieves the state-of-the-art performance of FID scores 5.56 and 7.23 for paired and unpaired on the VITON-HD dataset in quantitative comparisons.
   Submitted 15 November, 2024; originally announced November 2024.
   Comments: Image virtual try-on, 7 pages, 3 figures
6. arXiv:2411.08451 [pdf, other]  cs.CV
   AD-DINO: Attention-Dynamic DINO for Distance-Aware Embodied Reference Understanding
   Authors: Hao Guo, Wei Fan, Baichun Wei, Jianfei Zhu, Jin Tian, Chunzhi Yi, Feng Jiang
   Abstract: Embodied reference understanding is crucial for intelligent agents to predict referents based on human intention through gesture signals and language descriptions. This paper introduces the Attention-Dynamic DINO, a novel framework designed to mitigate misinterpretations of pointing gestures across various interaction contexts. Our approach integrates visual and textual features to simultaneously predict the target object's bounding box and the attention source in pointing gestures. Leveraging the distance-aware nature of nonverbal communication in visual perspective taking, we extend the virtual touch line mechanism and propose an attention-dynamic touch line to represent referring gesture based on interactive distances. The combination of this distance-aware approach and independent prediction of the attention source enhances the alignment between objects and the gesture represented line. Extensive experiments on the YouRefIt dataset demonstrate the efficacy of our gesture information understanding method in significantly improving task performance. Our model achieves 76.4% accuracy at the 0.25 IoU threshold and, notably, surpasses human performance at the 0.75 IoU threshold, marking a first in this domain. Comparative experiments with distance-unaware understanding methods from previous research further validate the superiority of the Attention-Dynamic Touch Line across diverse contexts.
   Submitted 13 November, 2024; originally announced November 2024.
7. arXiv:2411.07279 [pdf, other]  cs.AI, cs.CL, cs.LG
   The Surprising Effectiveness of Test-Time Training for Abstract Reasoning
   Authors: Ekin Akyürek, Mehul Damani, Linlu Qiu, Han Guo, Yoon Kim, Jacob Andreas
   Abstract: Language models have shown impressive performance on tasks within their training distribution, but often struggle with novel problems requiring complex reasoning. We investigate the effectiveness of test-time training (TTT) -- updating model parameters temporarily during inference using a loss derived from input data -- as a mechanism for improving models' reasoning capabilities, using the Abstraction and Reasoning Corpus (ARC) as a benchmark. Through systematic experimentation, we identify three crucial components for successful TTT: (1) initial finetuning on similar tasks, (2) auxiliary task format and augmentations, (3) per-instance training. TTT significantly improves performance on ARC tasks, achieving up to 6x improvement in accuracy compared to base fine-tuned models; applying TTT to an 8B-parameter language model, we achieve 53% accuracy on the ARC's public validation set, improving the state-of-the-art by nearly 25% for public and purely neural approaches. By ensembling our method with recent program generation approaches, we get SoTA public validation accuracy of 61.9%, matching the average human score. Our findings suggest that explicit symbolic search is not the only path to improved abstract reasoning in neural language models; additional test-time applied to continued training on few-shot examples can also be extremely effective.
   Submitted 11 November, 2024; originally announced November 2024.
   Comments: Preprint
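The mechanism described in this abstract (temporarily updating parameters at inference time on a loss derived from the test input) can be sketched generically as follows; the tiny linear model, few-shot support set, and hyperparameters are placeholders for illustration, not the paper's ARC pipeline.

```python
# Minimal sketch of per-instance test-time training: fine-tune a copy of the
# model on the test instance's support examples, predict, then discard it.
import copy
import torch
import torch.nn as nn

def predict_with_ttt(model, support_x, support_y, query_x, steps=20, lr=1e-3):
    adapted = copy.deepcopy(model)                 # keep the base model untouched
    opt = torch.optim.SGD(adapted.parameters(), lr=lr)
    loss_fn = nn.CrossEntropyLoss()
    adapted.train()
    for _ in range(steps):                         # per-instance training loop
        opt.zero_grad()
        loss = loss_fn(adapted(support_x), support_y)
        loss.backward()
        opt.step()
    adapted.eval()
    with torch.no_grad():
        return adapted(query_x).argmax(dim=-1)     # prediction from adapted weights

# Toy usage with a linear classifier on random "few-shot examples".
torch.manual_seed(0)
base = nn.Linear(16, 4)
sx, sy = torch.randn(8, 16), torch.randint(0, 4, (8,))
qx = torch.randn(2, 16)
print(predict_with_ttt(base, sx, sy, qx))
```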
8. arXiv:2411.07140 [pdf, other]  cs.CL
   Chinese SimpleQA: A Chinese Factuality Evaluation for Large Language Models
   Authors: Yancheng He, Shilong Li, Jiaheng Liu, Yingshui Tan, Weixun Wang, Hui Huang, Xingyuan Bu, Hangyu Guo, Chengwei Hu, Boren Zheng, Zhuoran Lin, Xuepeng Liu, Dekai Sun, Shirong Lin, Zhicheng Zheng, Xiaoyong Zhu, Wenbo Su, Bo Zheng
   Abstract: New LLM evaluation benchmarks are important to align with the rapid development of Large Language Models (LLMs). In this work, we present Chinese SimpleQA, the first comprehensive Chinese benchmark to evaluate the factuality ability of language models to answer short questions, and Chinese SimpleQA mainly has five properties (i.e., Chinese, Diverse, High-quality, Static, Easy-to-evaluate). Specifically, first, we focus on the Chinese language over 6 major topics with 99 diverse subtopics. Second, we conduct a comprehensive quality control process to achieve high-quality questions and answers, where the reference answers are static and cannot be changed over time. Third, following SimpleQA, the questions and answers are very short, and the grading process is easy-to-evaluate based on OpenAI API. Based on Chinese SimpleQA, we perform a comprehensive evaluation on the factuality abilities of existing LLMs. Finally, we hope that Chinese SimpleQA could guide the developers to better understand the Chinese factuality abilities of their models and facilitate the growth of foundation models.
   Submitted 13 November, 2024; v1 submitted 11 November, 2024; originally announced November 2024.

9. arXiv:2411.06746 [pdf, other]  cs.LG
   Neuromodulated Meta-Learning
   Authors: Jingyao Wang, Huijie Guo, Wenwen Qiang, Jiangmeng Li, Changwen Zheng, Hui Xiong, Gang Hua
   Abstract: Humans excel at adapting perceptions and actions to diverse environments, enabling efficient interaction with the external world. This adaptive capability relies on the biological nervous system (BNS), which activates different brain regions for distinct tasks. Meta-learning similarly trains machines to handle multiple tasks but relies on a fixed network structure, not as flexible as BNS. To investigate the role of flexible network structure (FNS) in meta-learning, we conduct extensive empirical and theoretical analyses, finding that model performance is tied to structure, with no universally optimal pattern across tasks. This reveals the crucial role of FNS in meta-learning, ensuring meta-learning to generate the optimal structure for each task, thereby maximizing the performance and learning efficiency of meta-learning. Motivated by this insight, we propose to define, measure, and model FNS in meta-learning. First, we define that an effective FNS should possess frugality, plasticity, and sensitivity. Then, to quantify FNS in practice, we present three measurements for these properties, collectively forming the structure constraint with theoretical supports. Building on this, we finally propose Neuromodulated Meta-Learning (NeuronML) to model FNS in meta-learning. It utilizes bi-level optimization to update both weights and structure with the structure constraint. Extensive theoretical and empirical evaluations demonstrate the effectiveness of NeuronML on various tasks. Code is publicly available at https://github.com/WangJingyao07/NeuronML.
   Submitted 11 November, 2024; originally announced November 2024.
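The abstract's notion of updating both weights and structure under a structure constraint can be illustrated, in a heavily simplified single-level form, by jointly training weights and per-neuron gates with a frugality-style penalty; the gate parameterization, penalty weight, and toy data below are assumptions, and NeuronML's actual bi-level scheme and measurements are not reproduced.

```python
# Minimal sketch: weights and a soft "structure" (per-neuron gates) optimized
# together under a sparsity-style frugality penalty.
import torch
import torch.nn as nn

class GatedMLP(nn.Module):
    def __init__(self, d_in=16, d_hidden=32, d_out=4):
        super().__init__()
        self.fc1 = nn.Linear(d_in, d_hidden)
        self.fc2 = nn.Linear(d_hidden, d_out)
        self.gate_logits = nn.Parameter(torch.zeros(d_hidden))  # structure variables

    def forward(self, x):
        gate = torch.sigmoid(self.gate_logits)       # soft neuron on/off mask
        return self.fc2(torch.relu(self.fc1(x)) * gate)

model = GatedMLP()
opt = torch.optim.Adam(model.parameters(), lr=1e-2)
x, y = torch.randn(64, 16), torch.randint(0, 4, (64,))
for step in range(100):
    opt.zero_grad()
    task_loss = nn.functional.cross_entropy(model(x), y)
    frugality = torch.sigmoid(model.gate_logits).mean()   # penalize active neurons
    (task_loss + 0.1 * frugality).backward()
    opt.step()
print("active neurons:", int((torch.sigmoid(model.gate_logits) > 0.5).sum()))
```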
10. arXiv:2411.06377 [pdf, other]  cs.RO
   SymmeTac: Symmetric Color LED Driven Efficient Photometric Stereo Reconstruction Methods for Camera-based Tactile Sensors
   Authors: Jieji Ren, Heng Guo, Zaiyan Yang, Jinnuo Zhang, Yueshi Dong, Ningbin Zhang, Boxin Shi, Jiang Zou, Guoying Gu
   Abstract: Camera-based tactile sensors can provide high-density surface geometry and force information for robots in the interaction process with the target. However, most existing methods cannot achieve accurate reconstruction with high efficiency, impeding the applications in robots. To address these problems, we propose an efficient two-shot photometric stereo method based on symmetric color LED distribution. Specifically, based on the sensing response curve of CMOS channels, we design orthogonal red and blue LEDs as illumination to acquire four observation maps using channel-splitting in a two-shot manner. Subsequently, we develop a two-shot photometric stereo theory, which can estimate accurate surface normal and greatly reduce the computing overhead in magnitude. Finally, leveraging the characteristics of the camera-based tactile sensor, we optimize the algorithm to be a highly efficient, pure addition operation. Simulation and real-world experiments demonstrate the advantages of our approach. Further details are available on: https://github.com/Tacxels/SymmeTac.
   Submitted 10 November, 2024; originally announced November 2024.
   Comments: This work has been submitted to the IEEE for possible publication
11. arXiv:2411.06171 [pdf, other]  cs.CL, cs.LG
   SEEKR: Selective Attention-Guided Knowledge Retention for Continual Learning of Large Language Models
   Authors: Jinghan He, Haiyun Guo, Kuan Zhu, Zihan Zhao, Ming Tang, Jinqiao Wang
   Abstract: Continual learning (CL) is crucial for language models to dynamically adapt to the evolving real-world demands. To mitigate the catastrophic forgetting problem in CL, data replay has been proven a simple and effective strategy, and the subsequent data-replay-based distillation can further enhance the performance. However, existing methods fail to fully exploit the knowledge embedded in models from previous tasks, resulting in the need for a relatively large number of replay samples to achieve good results. In this work, we first explore and emphasize the importance of attention weights in knowledge retention, and then propose a SElective attEntion-guided Knowledge Retention method (SEEKR) for data-efficient replay-based continual learning of large language models (LLMs). Specifically, SEEKR performs attention distillation on the selected attention heads for finer-grained knowledge retention, where the proposed forgettability-based and task-sensitivity-based measures are used to identify the most valuable attention heads. Experimental results on two continual learning benchmarks for LLMs demonstrate the superiority of SEEKR over the existing methods on both performance and efficiency. Explicitly, SEEKR achieves comparable or even better performance with only 1/10 of the replayed data used by other methods, and reduces the proportion of replayed data to 1%.
   Submitted 9 November, 2024; originally announced November 2024.
   Comments: EMNLP2024
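A generic form of the attention-distillation idea mentioned in this abstract (matching the current model's attention maps to a frozen previous-task checkpoint on selected heads) might look like the following; the random tensors and hard-coded head indices are placeholders, and SEEKR's forgettability and task-sensitivity head-selection measures are not implemented.

```python
# Minimal sketch of attention distillation restricted to a subset of heads.
import torch
import torch.nn.functional as F

def attention_distill_loss(student_attn, teacher_attn, selected_heads):
    """Both attn tensors: (batch, heads, seq, seq) post-softmax attention maps."""
    s = student_attn[:, selected_heads]               # keep only the chosen heads
    t = teacher_attn[:, selected_heads]
    # KL(teacher || student) per attention row, averaged over the batch.
    return F.kl_div(torch.log(s + 1e-9), t, reduction="batchmean")

# Toy usage with random maps standing in for real transformer attentions.
torch.manual_seed(0)
student = torch.softmax(torch.randn(2, 8, 5, 5), dim=-1)
teacher = torch.softmax(torch.randn(2, 8, 5, 5), dim=-1)
heads = torch.tensor([1, 4, 6])                       # e.g. the most "valuable" heads
print(attention_distill_loss(student, teacher, heads))
```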
Unlike previous robust estimator-based works (which require multiple images or RGB-D input) and learning-based works (which suffer from domain shift), MonoPlane combines the best of two worlds and establishes a plane reconstruction pipeline based on monocular geometric cues, resulting in accurate,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01226v1-abstract-full').style.display = 'inline'; document.getElementById('2411.01226v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01226v1-abstract-full" style="display: none;"> This paper presents a generalizable 3D plane detection and reconstruction framework named MonoPlane. Unlike previous robust estimator-based works (which require multiple images or RGB-D input) and learning-based works (which suffer from domain shift), MonoPlane combines the best of two worlds and establishes a plane reconstruction pipeline based on monocular geometric cues, resulting in accurate, robust and scalable 3D plane detection and reconstruction in the wild. Specifically, we first leverage large-scale pre-trained neural networks to obtain the depth and surface normals from a single image. These monocular geometric cues are then incorporated into a proximity-guided RANSAC framework to sequentially fit each plane instance. We exploit effective 3D point proximity and model such proximity via a graph within RANSAC to guide the plane fitting from noisy monocular depths, followed by image-level multi-plane joint optimization to improve the consistency among all plane instances. We further design a simple but effective pipeline to extend this single-view solution to sparse-view 3D plane reconstruction. Extensive experiments on a list of datasets demonstrate our superior zero-shot generalizability over baselines, achieving state-of-the-art plane reconstruction performance in a transferring setting. Our code is available at https://github.com/thuzhaowang/MonoPlane . <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01226v1-abstract-full').style.display = 'none'; document.getElementById('2411.01226v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
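The MonoPlane entry above builds its plane instances by running a proximity-guided RANSAC over 3D points lifted from predicted monocular depth. As a rough, self-contained sketch of only the generic RANSAC plane-fitting step (the paper's proximity graph, sequential instance fitting, and joint optimization are not reproduced), the following NumPy snippet fits one plane to a noisy point set; the function name, iteration count, and inlier threshold are illustrative choices, not values from the paper.
<pre><code class="language-python">
import numpy as np

def fit_plane_ransac(points, n_iters=200, inlier_thresh=0.02, seed=None):
    """Fit a plane (unit normal n, offset d with n.x + d = 0) to Nx3 points via RANSAC.
    A rough sketch of the classic plane-fitting step; thresholds are illustrative."""
    rng = np.random.default_rng(seed)
    best_inliers, best_model = None, None
    for _ in range(n_iters):
        sample = points[rng.choice(len(points), size=3, replace=False)]
        # Plane through the three sampled points.
        normal = np.cross(sample[1] - sample[0], sample[2] - sample[0])
        norm = np.linalg.norm(normal)
        if norm < 1e-8:          # degenerate (near-collinear) sample
            continue
        normal /= norm
        d = -normal @ sample[0]
        dist = np.abs(points @ normal + d)   # point-to-plane distances
        inliers = dist < inlier_thresh
        if best_inliers is None or inliers.sum() > best_inliers.sum():
            best_inliers, best_model = inliers, (normal, d)
    return best_model, best_inliers

# Toy usage: points near the plane z = 0.5 plus uniform outliers.
pts = np.random.default_rng(0).uniform(-1, 1, size=(500, 3))
pts[:400, 2] = 0.5 + 0.005 * np.random.default_rng(1).standard_normal(400)
model, inliers = fit_plane_ransac(pts, seed=2)
print("normal ~", np.round(model[0], 2), "inliers:", int(inliers.sum()))
</code></pre>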
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">IROS 2024 (oral)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00761">arXiv:2411.00761</a> <span> [<a href="https://arxiv.org/pdf/2411.00761">pdf</a>, <a href="https://arxiv.org/format/2411.00761">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Databases">cs.DB</span> </div> </div> <p class="title is-5 mathjax"> LCP: Enhancing Scientific Data Management with Lossy Compression for Particles </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Longtao Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+R">Ruoyu Li</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+C">Congrong Ren</a>, <a href="/search/cs?searchtype=author&query=Di%2C+S">Sheng Di</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jinyang Liu</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+J">Jiajun Huang</a>, <a href="/search/cs?searchtype=author&query=Underwood%2C+R">Robert Underwood</a>, <a href="/search/cs?searchtype=author&query=Grosset%2C+P">Pascal Grosset</a>, <a href="/search/cs?searchtype=author&query=Tao%2C+D">Dingwen Tao</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+X">Xin Liang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+H">Hanqi Guo</a>, <a href="/search/cs?searchtype=author&query=Capello%2C+F">Franck Capello</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+K">Kai Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00761v1-abstract-short" style="display: inline;"> Many scientific applications opt for particles instead of meshes as their basic primitives to model complex systems composed of billions of discrete entities. Such applications span a diverse array of scientific domains, including molecular dynamics, cosmology, computational fluid dynamics, and geology. The scale of the particles in those scientific applications increases substantially thanks to t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00761v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00761v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00761v1-abstract-full" style="display: none;"> Many scientific applications opt for particles instead of meshes as their basic primitives to model complex systems composed of billions of discrete entities. Such applications span a diverse array of scientific domains, including molecular dynamics, cosmology, computational fluid dynamics, and geology. The scale of the particles in those scientific applications increases substantially thanks to the ever-increasing computational power in high-performance computing (HPC) platforms. However, the actual gains from such increases are often undercut by obstacles in data management systems related to data storage, transfer, and processing. 
Lossy compression has been widely recognized as a promising solution to enhance scientific data management systems regarding such challenges, although most existing compression solutions are tailored for Cartesian grids and thus have sub-optimal results on discrete particle data. In this paper, we introduce LCP, an innovative lossy compressor designed for particle datasets, offering superior compression quality and higher speed than existing compression solutions. Specifically, our contribution is threefold. (1) We propose LCP-S, an error-bound aware block-wise spatial compressor to efficiently reduce particle data size. This approach is universally applicable to particle data across various domains. (2) We develop LCP, a hybrid compression solution for multi-frame particle data, featuring dynamic method selection and parameter optimization. (3) We evaluate our solution alongside eight state-of-the-art alternatives on eight real-world particle datasets from seven distinct domains. The results demonstrate that our solution achieves up to 104% improvement in compression ratios and up to 593% increase in speed compared to the second-best option, under the same error criteria. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00761v1-abstract-full').style.display = 'none'; document.getElementById('2411.00761v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by SIGMOD'25</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00625">arXiv:2411.00625</a> <span> [<a href="https://arxiv.org/pdf/2411.00625">pdf</a>, <a href="https://arxiv.org/format/2411.00625">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Toward Automated Algorithm Design: A Survey and Practical Guide to Meta-Black-Box-Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ma%2C+Z">Zeyuan Ma</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+H">Hongshu Guo</a>, <a href="/search/cs?searchtype=author&query=Gong%2C+Y">Yue-Jiao Gong</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jun Zhang</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+K+C">Kay Chen Tan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00625v2-abstract-short" style="display: inline;"> In this survey, we introduce Meta-Black-Box-Optimization~(MetaBBO) as an emerging avenue within the Evolutionary Computation~(EC) community, which incorporates Meta-learning approaches to assist automated algorithm design. Despite the success of MetaBBO, the current literature provides insufficient summaries of its key aspects and lacks practical guidance for implementation. 
To bridge this gap, we… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00625v2-abstract-full').style.display = 'inline'; document.getElementById('2411.00625v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00625v2-abstract-full" style="display: none;"> In this survey, we introduce Meta-Black-Box-Optimization~(MetaBBO) as an emerging avenue within the Evolutionary Computation~(EC) community, which incorporates Meta-learning approaches to assist automated algorithm design. Despite the success of MetaBBO, the current literature provides insufficient summaries of its key aspects and lacks practical guidance for implementation. To bridge this gap, we offer a comprehensive review of recent advances in MetaBBO, providing an in-depth examination of its key developments. We begin with a unified definition of the MetaBBO paradigm, followed by a systematic taxonomy of various algorithm design tasks, including algorithm selection, algorithm configuration, solution manipulation, and algorithm generation. Further, we conceptually summarize different learning methodologies behind current MetaBBO works, including reinforcement learning, supervised learning, neuroevolution, and in-context learning with Large Language Models. A comprehensive evaluation of the latest representative MetaBBO methods is then carried out, alongside an experimental analysis of their optimization performance, computational efficiency, and generalization ability. Based on the evaluation results, we meticulously identify a set of core designs that enhance the generalization and learning effectiveness of MetaBBO. Finally, we outline the vision for the field by providing insight into the latest trends and potential future directions. Relevant literature will be continuously collected and updated at \url{https://github.com/GMC-DRL/Awesome-MetaBBO}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00625v2-abstract-full').style.display = 'none'; document.getElementById('2411.00625v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
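The MetaBBO survey above frames automated algorithm design as a bi-level problem: a meta level learns to select or configure a black-box optimizer that is then run on a distribution of lower-level problems. A minimal sketch of that two-level structure, reduced to grid-searching a single step-size configuration for a toy (1+1)-ES over random sphere functions, is given below; it stands in for no particular MetaBBO method, and all names and settings are illustrative.
<pre><code class="language-python">
import numpy as np

rng = np.random.default_rng(0)

def base_optimizer(step_size, problem, iters=100):
    """Lower level: a minimal (1+1) evolution strategy with a fixed step size."""
    x = rng.standard_normal(problem["dim"])
    fx = problem["f"](x)
    for _ in range(iters):
        cand = x + step_size * rng.standard_normal(problem["dim"])
        fc = problem["f"](cand)
        if fc < fx:
            x, fx = cand, fc
    return fx

def sample_problem():
    """A small distribution of shifted sphere functions standing in for a task set."""
    shift = rng.uniform(-3, 3, size=5)
    return {"dim": 5, "f": lambda x, s=shift: float(np.sum((x - s) ** 2))}

# Meta level: pick the configuration (here just the step size) that performs best
# on average over sampled problems -- the algorithm-configuration flavour of
# MetaBBO, reduced to a plain grid search for illustration.
configs = [0.01, 0.05, 0.1, 0.3, 1.0]
scores = [np.mean([base_optimizer(s, sample_problem()) for _ in range(20)]) for s in configs]
print("best step size:", configs[int(np.argmin(scores))])
</code></pre>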
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.23225">arXiv:2410.23225</a> <span> [<a href="https://arxiv.org/pdf/2410.23225">pdf</a>, <a href="https://arxiv.org/ps/2410.23225">ps</a>, <a href="https://arxiv.org/format/2410.23225">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Data Structures and Algorithms">cs.DS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Discrete Mathematics">cs.DM</span> </div> </div> <p class="title is-5 mathjax"> Deterministic counting from coupling independence </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xiaoyu Chen</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+W">Weiming Feng</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+H">Heng Guo</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xinyuan Zhang</a>, <a href="/search/cs?searchtype=author&query=Zou%2C+Z">Zongrui Zou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.23225v1-abstract-short" style="display: inline;"> We show that spin systems with bounded degrees and coupling independence admit fully polynomial time approximation schemes (FPTAS). We design a new recursive deterministic counting algorithm to achieve this. As applications, we give the first FPTASes for $q$-colourings on graphs of bounded maximum degree $螖\ge 3$, when $q\ge (11/6-\varepsilon_0)螖$ for some small $\varepsilon_0\approx 10^{-5}$, or… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23225v1-abstract-full').style.display = 'inline'; document.getElementById('2410.23225v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.23225v1-abstract-full" style="display: none;"> We show that spin systems with bounded degrees and coupling independence admit fully polynomial time approximation schemes (FPTAS). We design a new recursive deterministic counting algorithm to achieve this. As applications, we give the first FPTASes for $q$-colourings on graphs of bounded maximum degree $螖\ge 3$, when $q\ge (11/6-\varepsilon_0)螖$ for some small $\varepsilon_0\approx 10^{-5}$, or when $螖\ge 125$ and $q\ge 1.809螖$, and on graphs with sufficiently large (but constant) girth, when $q\geq螖+3$. These bounds match the current best randomised approximate counting algorithms by Chen, Delcourt, Moitra, Perarnau, and Postle (2019), Carlson and Vigoda (2024), and Chen, Liu, Mani, and Moitra (2023), respectively. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23225v1-abstract-full').style.display = 'none'; document.getElementById('2410.23225v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.21759">arXiv:2410.21759</a> <span> [<a href="https://arxiv.org/pdf/2410.21759">pdf</a>, <a href="https://arxiv.org/format/2410.21759">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> IntLoRA: Integral Low-rank Adaptation of Quantized Diffusion Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guo%2C+H">Hang Guo</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yawei Li</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+T">Tao Dai</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+S">Shu-Tao Xia</a>, <a href="/search/cs?searchtype=author&query=Benini%2C+L">Luca Benini</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.21759v2-abstract-short" style="display: inline;"> Fine-tuning large-scale text-to-image diffusion models for various downstream tasks has yielded impressive results. However, the heavy computational burdens of tuning large models prevent personal customization. Recent advances have attempted to employ parameter-efficient fine-tuning (PEFT) techniques to adapt the floating-point (FP) or quantized pre-trained weights. Nonetheless, the adaptation pa… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21759v2-abstract-full').style.display = 'inline'; document.getElementById('2410.21759v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.21759v2-abstract-full" style="display: none;"> Fine-tuning large-scale text-to-image diffusion models for various downstream tasks has yielded impressive results. However, the heavy computational burdens of tuning large models prevent personal customization. Recent advances have attempted to employ parameter-efficient fine-tuning (PEFT) techniques to adapt the floating-point (FP) or quantized pre-trained weights. Nonetheless, the adaptation parameters in existing works are still restricted to FP arithmetic, hindering hardware-friendly acceleration. In this work, we propose IntLoRA, to further push the efficiency limits by using integer type (INT) low-rank parameters to adapt the quantized diffusion models. By working in the integer arithmetic, our IntLoRA offers three key advantages: (i) for fine-tuning, the pre-trained weights are quantized, reducing memory usage; (ii) for storage, both pre-trained and low-rank weights are in INT which consumes less disk space; (iii) for inference, IntLoRA weights can be naturally merged into quantized pre-trained weights through efficient integer multiplication or bit-shifting, eliminating additional post-training quantization. Extensive experiments demonstrate that IntLoRA can achieve performance on par with or even superior to the vanilla LoRA, accompanied by significant efficiency improvements. Code is available at \url{https://github.com/csguoh/IntLoRA}. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21759v2-abstract-full').style.display = 'none'; document.getElementById('2410.21759v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Technical Report</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.21487">arXiv:2410.21487</a> <span> [<a href="https://arxiv.org/pdf/2410.21487">pdf</a>, <a href="https://arxiv.org/format/2410.21487">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3627673.3679849">10.1145/3627673.3679849 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Enhancing CTR Prediction in Recommendation Domain with Search Query Representation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yuening Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+M">Man Chen</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Y">Yaochen Hu</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+W">Wei Guo</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yingxue Zhang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+H">Huifeng Guo</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yong Liu</a>, <a href="/search/cs?searchtype=author&query=Coates%2C+M">Mark Coates</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.21487v1-abstract-short" style="display: inline;"> Many platforms, such as e-commerce websites, offer both search and recommendation services simultaneously to better meet users' diverse needs. Recommendation services suggest items based on user preferences, while search services allow users to search for items before providing recommendations. 
Since users and items are often shared between the search and recommendation domains, there is a valuabl… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21487v1-abstract-full').style.display = 'inline'; document.getElementById('2410.21487v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.21487v1-abstract-full" style="display: none;"> Many platforms, such as e-commerce websites, offer both search and recommendation services simultaneously to better meet users' diverse needs. Recommendation services suggest items based on user preferences, while search services allow users to search for items before providing recommendations. Since users and items are often shared between the search and recommendation domains, there is a valuable opportunity to enhance the recommendation domain by leveraging user preferences extracted from the search domain. Existing approaches either overlook the shift in user intention between these domains or fail to capture the significant impact of learning from users' search queries on understanding their interests. In this paper, we propose a framework that learns from user search query embeddings within the context of user preferences in the recommendation domain. Specifically, user search query sequences from the search domain are used to predict the items users will click at the next time point in the recommendation domain. Additionally, the relationship between queries and items is explored through contrastive learning. To address issues of data sparsity, the diffusion model is incorporated to infer positive items the user will select after searching with certain queries in a denoising manner, which is particularly effective in preventing false positives. Effectively extracting this information, the queries are integrated into click-through rate prediction in the recommendation domain. Experimental analysis demonstrates that our model outperforms state-of-the-art models in the recommendation domain. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21487v1-abstract-full').style.display = 'none'; document.getElementById('2410.21487v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
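The CTR paper above feeds representations of a user's search queries into click-through-rate prediction in the recommendation domain. A minimal illustration of that interface, omitting the paper's contrastive learning and diffusion components, is sketched below with random embedding tables and a logistic scoring head; every table, dimension, and function name here is made up for the example.
<pre><code class="language-python">
import numpy as np

rng = np.random.default_rng(0)
d = 16
query_emb = rng.standard_normal((1000, d))   # toy lookup table for search queries
item_emb = rng.standard_normal((500, d))     # toy lookup table for candidate items
w = rng.standard_normal(2 * d) * 0.1         # weights of a logistic CTR head

def predict_ctr(query_history, item_id):
    """Score one (user, item) pair: mean-pool the user's recent search-query
    embeddings, concatenate with the item embedding, apply a logistic head."""
    user_vec = query_emb[query_history].mean(axis=0)
    features = np.concatenate([user_vec, item_emb[item_id]])
    return 1.0 / (1.0 + np.exp(-features @ w))

print(round(predict_ctr([3, 17, 42], item_id=7), 3))
</code></pre>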
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by CIKM 2024 Full Research Track</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> CIKM (2024) 2462-2471 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20824">arXiv:2410.20824</a> <span> [<a href="https://arxiv.org/pdf/2410.20824">pdf</a>, <a href="https://arxiv.org/format/2410.20824">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> FreqMark: Invisible Image Watermarking via Frequency Based Optimization in Latent Space </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guo%2C+Y">Yiyang Guo</a>, <a href="/search/cs?searchtype=author&query=Li%2C+R">Ruizhe Li</a>, <a href="/search/cs?searchtype=author&query=Hui%2C+M">Mude Hui</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+H">Hanzhong Guo</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chen Zhang</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+C">Chuangjian Cai</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+L">Le Wan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shangfei Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.20824v1-abstract-short" style="display: inline;"> Invisible watermarking is essential for safeguarding digital content, enabling copyright protection and content authentication. However, existing watermarking methods fall short in robustness against regeneration attacks. In this paper, we propose a novel method called FreqMark that involves unconstrained optimization of the image latent frequency space obtained after VAE encoding. Specifically, F… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20824v1-abstract-full').style.display = 'inline'; document.getElementById('2410.20824v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20824v1-abstract-full" style="display: none;"> Invisible watermarking is essential for safeguarding digital content, enabling copyright protection and content authentication. However, existing watermarking methods fall short in robustness against regeneration attacks. In this paper, we propose a novel method called FreqMark that involves unconstrained optimization of the image latent frequency space obtained after VAE encoding. Specifically, FreqMark embeds the watermark by optimizing the latent frequency space of the images and then extracts the watermark through a pre-trained image encoder. This optimization allows a flexible trade-off between image quality with watermark robustness and effectively resists regeneration attacks. 
Experimental results demonstrate that FreqMark offers significant advantages in image quality and robustness, permits flexible selection of the encoding bit number, and achieves a bit accuracy exceeding 90% when encoding a 48-bit hidden message under various attack scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20824v1-abstract-full').style.display = 'none'; document.getElementById('2410.20824v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20789">arXiv:2410.20789</a> <span> [<a href="https://arxiv.org/pdf/2410.20789">pdf</a>, <a href="https://arxiv.org/format/2410.20789">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> LoDAvatar: Hierarchical Embedding and Adaptive Levels of Detail with Gaussian Splatting for Enhanced Human Avatars </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Dongye%2C+X">Xiaonuo Dongye</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+H">Hanzhi Guo</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+L">Le Luo</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+H">Haiyan Jiang</a>, <a href="/search/cs?searchtype=author&query=Bao%2C+Y">Yihua Bao</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+Z">Zeyu Tian</a>, <a href="/search/cs?searchtype=author&query=Weng%2C+D">Dongdong Weng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.20789v1-abstract-short" style="display: inline;"> With the advancement of virtual reality, the demand for 3D human avatars is increasing. The emergence of Gaussian Splatting technology has enabled the rendering of Gaussian avatars with superior visual quality and reduced computational costs. Despite numerous methods researchers propose for implementing drivable Gaussian avatars, limited attention has been given to balancing visual quality and com… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20789v1-abstract-full').style.display = 'inline'; document.getElementById('2410.20789v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20789v1-abstract-full" style="display: none;"> With the advancement of virtual reality, the demand for 3D human avatars is increasing. The emergence of Gaussian Splatting technology has enabled the rendering of Gaussian avatars with superior visual quality and reduced computational costs. Despite numerous methods researchers propose for implementing drivable Gaussian avatars, limited attention has been given to balancing visual quality and computational costs. In this paper, we introduce LoDAvatar, a method that introduces levels of detail into Gaussian avatars through hierarchical embedding and selective detail enhancement methods. 
The key steps of LoDAvatar encompass data preparation, Gaussian embedding, Gaussian optimization, and selective detail enhancement. We conducted experiments involving Gaussian avatars at various levels of detail, employing both objective assessments and subjective evaluations. The outcomes indicate that incorporating levels of detail into Gaussian avatars can decrease computational costs during rendering while upholding commendable visual quality, thereby enhancing runtime frame rates. We advocate adopting LoDAvatar to render multiple dynamic Gaussian avatars or extensive Gaussian scenes to balance visual quality and computational costs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20789v1-abstract-full').style.display = 'none'; document.getElementById('2410.20789v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages, 7 figures, submitted to IEEE VR 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.19933">arXiv:2410.19933</a> <span> [<a href="https://arxiv.org/pdf/2410.19933">pdf</a>, <a href="https://arxiv.org/format/2410.19933">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> </div> </div> <p class="title is-5 mathjax"> Enhancing Safety in Reinforcement Learning with Human Feedback via Rectified Policy Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Peng%2C+X">Xiyue Peng</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+H">Hengquan Guo</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiawei Zhang</a>, <a href="/search/cs?searchtype=author&query=Zou%2C+D">Dongqing Zou</a>, <a href="/search/cs?searchtype=author&query=Shao%2C+Z">Ziyu Shao</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+H">Honghao Wei</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xin Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.19933v1-abstract-short" style="display: inline;"> Balancing helpfulness and safety (harmlessness) is a critical challenge in aligning large language models (LLMs). Current approaches often decouple these two objectives, training separate preference models for helpfulness and safety, while framing safety as a constraint within a constrained Markov Decision Process (CMDP) framework. 
However, these methods can lead to ``safety interference'', where… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19933v1-abstract-full').style.display = 'inline'; document.getElementById('2410.19933v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.19933v1-abstract-full" style="display: none;"> Balancing helpfulness and safety (harmlessness) is a critical challenge in aligning large language models (LLMs). Current approaches often decouple these two objectives, training separate preference models for helpfulness and safety, while framing safety as a constraint within a constrained Markov Decision Process (CMDP) framework. However, these methods can lead to ``safety interference'', where average-based safety constraints compromise the safety of some prompts in favor of others. To address this issue, we propose \textbf{Rectified Policy Optimization (RePO)}, which replaces the average safety constraint with stricter (per prompt) safety constraints. At the core of RePO is a policy update mechanism driven by rectified policy gradients, which penalizes the strict safety violation of every prompt, thereby enhancing safety across nearly all prompts. Our experiments on Alpaca-7B demonstrate that RePO improves the safety alignment and reduces the safety interference compared to baseline methods. Code is available at https://github.com/pxyWaterMoon/RePO. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19933v1-abstract-full').style.display = 'none'; document.getElementById('2410.19933v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
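RePO's core change, per the abstract, is replacing an average safety constraint with a rectified per-prompt penalty so that one unsafe prompt cannot be averaged away by safe ones. The toy computation below contrasts the two penalty forms on made-up per-prompt rewards and costs; the numbers, budget, and multiplier are illustrative, and none of the policy-gradient machinery is shown.
<pre><code class="language-python">
import numpy as np

# Toy per-prompt quantities: helpfulness reward and safety cost, with budget b.
reward = np.array([0.9, 0.7, 0.8, 0.6])
cost   = np.array([0.2, 1.4, 0.1, 0.9])
budget = 0.5
lam = 2.0

# Average-based constraint: only the mean cost is penalized, so one very unsafe
# prompt can be offset by several safe ones ("safety interference").
avg_objective = reward.mean() - lam * max(0.0, cost.mean() - budget)

# Rectified (per-prompt) penalty: every prompt's own violation is penalized.
rect_objective = reward.mean() - lam * np.maximum(0.0, cost - budget).mean()

print(round(avg_objective, 3), round(rect_objective, 3))
</code></pre>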
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.19720">arXiv:2410.19720</a> <span> [<a href="https://arxiv.org/pdf/2410.19720">pdf</a>, <a href="https://arxiv.org/format/2410.19720">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> 2D-DPO: Scaling Direct Preference Optimization with 2-Dimensional Supervision </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+S">Shilong Li</a>, <a href="/search/cs?searchtype=author&query=He%2C+Y">Yancheng He</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+H">Hui Huang</a>, <a href="/search/cs?searchtype=author&query=Bu%2C+X">Xingyuan Bu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jiaheng Liu</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+H">Hangyu Guo</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Weixun Wang</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+J">Jihao Gu</a>, <a href="/search/cs?searchtype=author&query=Su%2C+W">Wenbo Su</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+B">Bo Zheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.19720v1-abstract-short" style="display: inline;"> Recent advancements in Direct Preference Optimization (DPO) have significantly enhanced the alignment of Large Language Models (LLMs) with human preferences, owing to its simplicity and effectiveness. However, existing methods typically optimize a scalar score or ranking reward, thereby overlooking the multi-dimensional nature of human preferences. In this work, we propose to extend the preference… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19720v1-abstract-full').style.display = 'inline'; document.getElementById('2410.19720v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.19720v1-abstract-full" style="display: none;"> Recent advancements in Direct Preference Optimization (DPO) have significantly enhanced the alignment of Large Language Models (LLMs) with human preferences, owing to its simplicity and effectiveness. However, existing methods typically optimize a scalar score or ranking reward, thereby overlooking the multi-dimensional nature of human preferences. In this work, we propose to extend the preference of DPO to two dimensions: segments and aspects. We first introduce a 2D supervision dataset called HelpSteer-2D. For the segment dimension, we divide the response into sentences and assign scores to each segment. For the aspect dimension, we meticulously design several criteria covering the response quality rubrics. With the 2-dimensional signals as feedback, we develop a 2D-DPO framework, decomposing the overall objective into multi-segment and multi-aspect objectives. Extensive experiments on popular benchmarks demonstrate that 2D-DPO performs better than methods that optimize for scalar or 1-dimensional preferences. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19720v1-abstract-full').style.display = 'none'; document.getElementById('2410.19720v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The first four authors contributed equally, 25 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.19250">arXiv:2410.19250</a> <span> [<a href="https://arxiv.org/pdf/2410.19250">pdf</a>, <a href="https://arxiv.org/format/2410.19250">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> The Reopening of Pandora's Box: Analyzing the Role of LLMs in the Evolving Battle Against AI-Generated Fake News </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xinyu Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wenbo Zhang</a>, <a href="/search/cs?searchtype=author&query=Koneru%2C+S">Sai Koneru</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+H">Hangzhi Guo</a>, <a href="/search/cs?searchtype=author&query=Mingole%2C+B">Bonam Mingole</a>, <a href="/search/cs?searchtype=author&query=Sundar%2C+S+S">S. Shyam Sundar</a>, <a href="/search/cs?searchtype=author&query=Rajtmajer%2C+S">Sarah Rajtmajer</a>, <a href="/search/cs?searchtype=author&query=Yadav%2C+A">Amulya Yadav</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.19250v1-abstract-short" style="display: inline;"> With the rise of AI-generated content spewed at scale from large language models (LLMs), genuine concerns about the spread of fake news have intensified. The perceived ability of LLMs to produce convincing fake news at scale poses new challenges for both human and automated fake news detection systems. To address this gap, this work presents the findings from a university-level competition which a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19250v1-abstract-full').style.display = 'inline'; document.getElementById('2410.19250v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.19250v1-abstract-full" style="display: none;"> With the rise of AI-generated content spewed at scale from large language models (LLMs), genuine concerns about the spread of fake news have intensified. The perceived ability of LLMs to produce convincing fake news at scale poses new challenges for both human and automated fake news detection systems. To address this gap, this work presents the findings from a university-level competition which aimed to explore how LLMs can be used by humans to create fake news, and to assess the ability of human annotators and AI models to detect it. 
A total of 110 participants used LLMs to create 252 unique fake news stories, and 84 annotators participated in the detection tasks. Our findings indicate that LLMs are ~68% more effective at detecting real news than humans. However, for fake news detection, the performance of LLMs and humans remains comparable (~60% accuracy). Additionally, we examine the impact of visual elements (e.g., pictures) in news on the accuracy of detecting fake news stories. Finally, we also examine various strategies used by fake news creators to enhance the credibility of their AI-generated content. This work highlights the increasing complexity of detecting AI-generated fake news, particularly in collaborative human-AI settings. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19250v1-abstract-full').style.display = 'none'; document.getElementById('2410.19250v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.18403">arXiv:2410.18403</a> <span> [<a href="https://arxiv.org/pdf/2410.18403">pdf</a>, <a href="https://arxiv.org/format/2410.18403">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Biomolecules">q-bio.BM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Structure Language Models for Protein Conformation Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lu%2C+J">Jiarui Lu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xiaoyin Chen</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+S+Z">Stephen Zhewen Lu</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+C">Chence Shi</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+H">Hongyu Guo</a>, <a href="/search/cs?searchtype=author&query=Bengio%2C+Y">Yoshua Bengio</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+J">Jian Tang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.18403v1-abstract-short" style="display: inline;"> Proteins adopt multiple structural conformations to perform their diverse biological functions, and understanding these conformations is crucial for advancing drug discovery. Traditional physics-based simulation methods often struggle with sampling equilibrium conformations and are computationally expensive. Recently, deep generative models have shown promise in generating protein conformations as… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18403v1-abstract-full').style.display = 'inline'; document.getElementById('2410.18403v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.18403v1-abstract-full" style="display: none;"> Proteins adopt multiple structural conformations to perform their diverse biological functions, and understanding these conformations is crucial for advancing drug discovery. 
Traditional physics-based simulation methods often struggle with sampling equilibrium conformations and are computationally expensive. Recently, deep generative models have shown promise in generating protein conformations as a more efficient alternative. However, these methods predominantly rely on the diffusion process within a 3D geometric space, which typically centers around the vicinity of metastable states and is often inefficient in terms of runtime. In this paper, we introduce Structure Language Modeling (SLM) as a novel framework for efficient protein conformation generation. Specifically, the protein structures are first encoded into a compact latent space using a discrete variational auto-encoder, followed by conditional language modeling that effectively captures sequence-specific conformation distributions. This enables a more efficient and interpretable exploration of diverse ensemble modes compared to existing methods. Based on this general framework, we instantiate SLM with various popular LM architectures as well as proposing the ESMDiff, a novel BERT-like structure language model fine-tuned from ESM3 with masked diffusion. We verify our approach in various scenarios, including the equilibrium dynamics of BPTI, conformational change pairs, and intrinsically disordered proteins. SLM provides a highly efficient solution, offering a 20-100x speedup than existing methods in generating diverse conformations, shedding light on promising avenues for future research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18403v1-abstract-full').style.display = 'none'; document.getElementById('2410.18403v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Preprint. 
Under Review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.15636">arXiv:2410.15636</a> <span> [<a href="https://arxiv.org/pdf/2410.15636">pdf</a>, <a href="https://arxiv.org/format/2410.15636">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> LucidFusion: Generating 3D Gaussians with Arbitrary Unposed Images </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=He%2C+H">Hao He</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+Y">Yixun Liang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L">Luozhou Wang</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+Y">Yuanhao Cai</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xinli Xu</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+H">Hao-Xiang Guo</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+X">Xiang Wen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yingcong Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.15636v2-abstract-short" style="display: inline;"> Recent large reconstruction models have made notable progress in generating high-quality 3D objects from single images. However, these methods often struggle with controllability, as they lack information from multiple views, leading to incomplete or inconsistent 3D reconstructions. To address this limitation, we introduce LucidFusion, a flexible end-to-end feed-forward framework that leverages th… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15636v2-abstract-full').style.display = 'inline'; document.getElementById('2410.15636v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.15636v2-abstract-full" style="display: none;"> Recent large reconstruction models have made notable progress in generating high-quality 3D objects from single images. However, these methods often struggle with controllability, as they lack information from multiple views, leading to incomplete or inconsistent 3D reconstructions. To address this limitation, we introduce LucidFusion, a flexible end-to-end feed-forward framework that leverages the Relative Coordinate Map (RCM). Unlike traditional methods linking images to 3D world thorough pose, LucidFusion utilizes RCM to align geometric features coherently across different views, making it highly adaptable for 3D generation from arbitrary, unposed images. Furthermore, LucidFusion seamlessly integrates with the original single-image-to-3D pipeline, producing detailed 3D Gaussians at a resolution of $512 \times 512$, making it well-suited for a wide range of applications. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15636v2-abstract-full').style.display = 'none'; document.getElementById('2410.15636v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">17 pages, 12 figures, [project page](https://heye0507.github.io/LucidFusion_page/)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.15467">arXiv:2410.15467</a> <span> [<a href="https://arxiv.org/pdf/2410.15467">pdf</a>, <a href="https://arxiv.org/format/2410.15467">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> Hey GPT, Can You be More Racist? Analysis from Crowdsourced Attempts to Elicit Biased Content from Generative AI </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guo%2C+H">Hangzhi Guo</a>, <a href="/search/cs?searchtype=author&query=Venkit%2C+P+N">Pranav Narayanan Venkit</a>, <a href="/search/cs?searchtype=author&query=Jang%2C+E">Eunchae Jang</a>, <a href="/search/cs?searchtype=author&query=Srinath%2C+M">Mukund Srinath</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wenbo Zhang</a>, <a href="/search/cs?searchtype=author&query=Mingole%2C+B">Bonam Mingole</a>, <a href="/search/cs?searchtype=author&query=Gupta%2C+V">Vipul Gupta</a>, <a href="/search/cs?searchtype=author&query=Varshney%2C+K+R">Kush R. Varshney</a>, <a href="/search/cs?searchtype=author&query=Sundar%2C+S+S">S. Shyam Sundar</a>, <a href="/search/cs?searchtype=author&query=Yadav%2C+A">Amulya Yadav</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.15467v1-abstract-short" style="display: inline;"> The widespread adoption of large language models (LLMs) and generative AI (GenAI) tools across diverse applications has amplified the importance of addressing societal biases inherent within these technologies. While the NLP community has extensively studied LLM bias, research investigating how non-expert users perceive and interact with biases from these systems remains limited. 
As these technolo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15467v1-abstract-full').style.display = 'inline'; document.getElementById('2410.15467v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.15467v1-abstract-full" style="display: none;"> The widespread adoption of large language models (LLMs) and generative AI (GenAI) tools across diverse applications has amplified the importance of addressing societal biases inherent within these technologies. While the NLP community has extensively studied LLM bias, research investigating how non-expert users perceive and interact with biases from these systems remains limited. As these technologies become increasingly prevalent, understanding this question is crucial to inform model developers in their efforts to mitigate bias. To address this gap, this work presents the findings from a university-level competition, which challenged participants to design prompts for eliciting biased outputs from GenAI tools. We quantitatively and qualitatively analyze the competition submissions and identify a diverse set of biases in GenAI and strategies employed by participants to induce bias in GenAI. Our finding provides unique insights into how non-expert users perceive and interact with biases from GenAI tools. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15467v1-abstract-full').style.display = 'none'; document.getElementById('2410.15467v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.15430">arXiv:2410.15430</a> <span> [<a href="https://arxiv.org/pdf/2410.15430">pdf</a>, <a href="https://arxiv.org/format/2410.15430">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> BoostAdapter: Improving Vision-Language Test-Time Adaptation via Regional Bootstrapping </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+T">Taolin Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jinpeng Wang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+H">Hang Guo</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+T">Tao Dai</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+B">Bin Chen</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+S">Shu-Tao Xia</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.15430v2-abstract-short" style="display: inline;"> Adaptation of pretrained vision-language models such as CLIP to various downstream tasks have raised great interest in recent researches. Previous works have proposed a variety of test-time adaptation (TTA) methods to achieve strong generalization without any knowledge of the target domain. 
However, existing training-required TTA approaches like TPT necessitate entropy minimization that involves l… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15430v2-abstract-full').style.display = 'inline'; document.getElementById('2410.15430v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.15430v2-abstract-full" style="display: none;"> Adaptation of pretrained vision-language models such as CLIP to various downstream tasks has raised great interest in recent research. Previous works have proposed a variety of test-time adaptation (TTA) methods to achieve strong generalization without any knowledge of the target domain. However, existing training-required TTA approaches like TPT necessitate entropy minimization that involves large computational overhead, while training-free methods like TDA overlook the potential for information mining from the test samples themselves. In this paper, we break down the design of existing popular training-required and training-free TTA methods and bridge the gap between them within our framework. Specifically, we maintain a lightweight key-value memory for feature retrieval from instance-agnostic historical samples and instance-aware boosting samples. The historical samples are filtered from the testing data stream and serve to extract useful information from the target distribution, while the boosting samples are drawn from regional bootstrapping and capture the knowledge of the test sample itself. We theoretically justify the rationality behind our method and empirically verify its effectiveness on both out-of-distribution and cross-domain datasets, showcasing its applicability in real-world situations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15430v2-abstract-full').style.display = 'none'; document.getElementById('2410.15430v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.14620">arXiv:2410.14620</a> <span> [<a href="https://arxiv.org/pdf/2410.14620">pdf</a>, <a href="https://arxiv.org/format/2410.14620">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Site-Specific Outdoor Propagation Assessment and Ray-Tracing Analysis for Wireless Digital Twins </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Aram%2C+M+G">Morteza Ghaderi Aram</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+H">Hao Guo</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+M">Mingsheng Yin</a>, <a href="/search/cs?searchtype=author&query=Svensson%2C+T">Tommy Svensson</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.14620v1-abstract-short" style="display: inline;"> Digital twinning is becoming increasingly vital in the design and real-time control of future wireless networks by providing precise cost-effective simulations, predictive insights, and real-time data integration. This paper explores the application of digital twinning in optimizing wireless communication systems within urban environments, where building arrangements can critically impact network… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14620v1-abstract-full').style.display = 'inline'; document.getElementById('2410.14620v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.14620v1-abstract-full" style="display: none;"> Digital twinning is becoming increasingly vital in the design and real-time control of future wireless networks by providing precise cost-effective simulations, predictive insights, and real-time data integration. This paper explores the application of digital twinning in optimizing wireless communication systems within urban environments, where building arrangements can critically impact network performances. We develop a digital twin platform to simulate and analyze how factors such as building positioning, base station placement, and antenna design influence wireless propagation. The ray-tracing software package of Matlab is compared with Remcom Wireless InSite. Using a realistic radiation pattern of a base transceiver station (BTS) antenna, ray tracing simulations for signal propagation and interactions in urban landscapes are then extensively examined. By analyzing radio heat maps alongside antenna patterns, we gain valuable insights into optimizing wireless deployment strategies. This study highlights the potential of digital twinning as a critical tool for urban planners and network engineers. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14620v1-abstract-full').style.display = 'none'; document.getElementById('2410.14620v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.13413">arXiv:2410.13413</a> <span> [<a href="https://arxiv.org/pdf/2410.13413">pdf</a>, <a href="https://arxiv.org/format/2410.13413">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Think Thrice Before You Act: Progressive Thought Refinement in Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Du%2C+C">Chengyu Du</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J">Jinyi Han</a>, <a href="/search/cs?searchtype=author&query=Ying%2C+Y">Yizhou Ying</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+A">Aili Chen</a>, <a href="/search/cs?searchtype=author&query=He%2C+Q">Qianyu He</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+H">Haokun Zhao</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+S">Sirui Xia</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+H">Haoran Guo</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+J">Jiaqing Liang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zulong Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+L">Liangyue Li</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+Y">Yanghua Xiao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.13413v1-abstract-short" style="display: inline;"> Recent advancements in large language models (LLMs) have demonstrated that progressive refinement, rather than providing a single answer, results in more accurate and thoughtful outputs. However, existing methods often rely heavily on supervision signals to evaluate previous responses, making it difficult to assess output quality in more open-ended scenarios effectively. Additionally, these method… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13413v1-abstract-full').style.display = 'inline'; document.getElementById('2410.13413v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.13413v1-abstract-full" style="display: none;"> Recent advancements in large language models (LLMs) have demonstrated that progressive refinement, rather than providing a single answer, results in more accurate and thoughtful outputs. However, existing methods often rely heavily on supervision signals to evaluate previous responses, making it difficult to assess output quality in more open-ended scenarios effectively. Additionally, these methods are typically designed for specific tasks, which limits their generalization to new domains. 
To address these limitations, we propose Progressive Thought Refinement (PTR), a framework that enables LLMs to refine their responses progressively. PTR operates in two phases: (1) Thought data construction stage: We propose a weak and strong model collaborative selection strategy to build a high-quality progressive refinement dataset to ensure logical consistency from thought to answers, and the answers are gradually refined in each round. (2) Thought-Mask Fine-Tuning Phase: We design a training structure to mask the "thought" and adjust loss weights to encourage LLMs to refine prior thought, teaching them to implicitly understand "how to improve" rather than "what is correct." Experimental results show that PTR significantly enhances LLM performance across ten diverse tasks (avg. from 49.6% to 53.5%) without task-specific fine-tuning. Notably, in more open-ended tasks, LLMs also demonstrate substantial improvements in the quality of responses beyond mere accuracy, suggesting that PTR truly teaches LLMs to self-improve over time. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13413v1-abstract-full').style.display = 'none'; document.getElementById('2410.13413v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 4 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.12295">arXiv:2410.12295</a> <span> [<a href="https://arxiv.org/pdf/2410.12295">pdf</a>, <a href="https://arxiv.org/format/2410.12295">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Consistency Calibration: Improving Uncertainty Calibration via Consistency among Perturbed Neighbors </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tao%2C+L">Linwei Tao</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+H">Haolan Guo</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+M">Minjing Dong</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+C">Chang Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.12295v1-abstract-short" style="display: inline;"> Calibration is crucial in deep learning applications, especially in fields like healthcare and autonomous driving, where accurate confidence estimates are vital for decision-making. However, deep neural networks often suffer from miscalibration, with reliability diagrams and Expected Calibration Error (ECE) being the only standard perspective for evaluating calibration performance. 
In this paper,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12295v1-abstract-full').style.display = 'inline'; document.getElementById('2410.12295v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.12295v1-abstract-full" style="display: none;"> Calibration is crucial in deep learning applications, especially in fields like healthcare and autonomous driving, where accurate confidence estimates are vital for decision-making. However, deep neural networks often suffer from miscalibration, with reliability diagrams and Expected Calibration Error (ECE) being the only standard perspective for evaluating calibration performance. In this paper, we introduce the concept of consistency as an alternative perspective on model calibration, inspired by the uncertainty estimation literature in large language models (LLMs). We highlight its advantages over the traditional reliability-based view. Building on this concept, we propose a post-hoc calibration method called Consistency Calibration (CC), which adjusts confidence based on the model's consistency across perturbed inputs. CC is particularly effective in local uncertainty estimation, as it requires no additional data samples or label information, instead generating input perturbations directly from the source data. Moreover, we show that performing perturbations at the logit level significantly improves computational efficiency. We validate the effectiveness of CC through extensive comparisons with various post-hoc and training-time calibration methods, demonstrating state-of-the-art performance on standard datasets such as CIFAR-10, CIFAR-100, and ImageNet, as well as on long-tailed datasets like ImageNet-LT. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12295v1-abstract-full').style.display = 'none'; document.getElementById('2410.12295v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.12169">arXiv:2410.12169</a> <span> [<a href="https://arxiv.org/pdf/2410.12169">pdf</a>, <a href="https://arxiv.org/format/2410.12169">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Towards Autonomous Indoor Parking: A Globally Consistent Semantic SLAM System and A Semantic Localization Subsystem </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sha%2C+Y">Yichen Sha</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+S">Siting Zhu</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+H">Hekui Guo</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhong Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hesheng Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.12169v1-abstract-short" style="display: inline;"> We propose a globally consistent semantic SLAM system (GCSLAM) and a semantic-fusion localization subsystem (SF-Loc), which achieves accurate semantic mapping and robust localization in complex parking lots. Visual cameras (front-view and surround-view), IMU, and wheel encoder form the input sensor configuration of our system. The first part of our work is GCSLAM. GCSLAM introduces a novel factor… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12169v1-abstract-full').style.display = 'inline'; document.getElementById('2410.12169v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.12169v1-abstract-full" style="display: none;"> We propose a globally consistent semantic SLAM system (GCSLAM) and a semantic-fusion localization subsystem (SF-Loc), which achieves accurate semantic mapping and robust localization in complex parking lots. Visual cameras (front-view and surround-view), IMU, and wheel encoder form the input sensor configuration of our system. The first part of our work is GCSLAM. GCSLAM introduces a novel factor graph for the optimization of poses and semantic map, which incorporates innovative error terms based on multi-sensor data and BEV (bird's-eye view) semantic information. Additionally, GCSLAM integrates a Global Slot Management module that stores and manages parking slot observations. SF-Loc is the second part of our work, which leverages the semantic map built by GCSLAM to conduct map-based localization. SF-Loc integrates registration results and odometry poses with a novel factor graph. Our system demonstrates superior performance over existing SLAM on two real-world datasets, showing excellent capabilities in robust global localization and precise semantic mapping. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12169v1-abstract-full').style.display = 'none'; document.getElementById('2410.12169v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.11710">arXiv:2410.11710</a> <span> [<a href="https://arxiv.org/pdf/2410.11710">pdf</a>, <a href="https://arxiv.org/format/2410.11710">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> MTU-Bench: A Multi-granularity Tool-Use Benchmark for Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+P">Pei Wang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yanan Wu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zekun Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jiaheng Liu</a>, <a href="/search/cs?searchtype=author&query=Song%2C+X">Xiaoshuai Song</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+Z">Zhongyuan Peng</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+K">Ken Deng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chenchen Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jiakai Wang</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+J">Junran Peng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+G">Ge Zhang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+H">Hangyu Guo</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhaoxiang Zhang</a>, <a href="/search/cs?searchtype=author&query=Su%2C+W">Wenbo Su</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+B">Bo Zheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.11710v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) have displayed massive improvements in reasoning and decision-making skills and can hold natural conversations with users. Recently, many tool-use benchmark datasets have been proposed. However, existing datasets have the following limitations: (1). Insufficient evaluation scenarios (e.g., only cover limited tool-use scenes). (2). Extensive evaluation costs (e.g., GPT… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11710v1-abstract-full').style.display = 'inline'; document.getElementById('2410.11710v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.11710v1-abstract-full" style="display: none;"> Large Language Models (LLMs) have displayed massive improvements in reasoning and decision-making skills and can hold natural conversations with users. Recently, many tool-use benchmark datasets have been proposed. However, existing datasets have the following limitations: (1). Insufficient evaluation scenarios (e.g., only cover limited tool-use scenes). (2). Extensive evaluation costs (e.g., GPT API costs). To address these limitations, in this work, we propose a multi-granularity tool-use benchmark for large language models called MTU-Bench. For the "multi-granularity" property, our MTU-Bench covers five tool usage scenes (i.e., single-turn and single-tool, single-turn and multiple-tool, multiple-turn and single-tool, multiple-turn and multiple-tool, and out-of-distribution tasks). 
Besides, all evaluation metrics of our MTU-Bench are based on the prediction results and the ground truth without using any GPT or human evaluation metrics. Moreover, our MTU-Bench is collected by transforming existing high-quality datasets to simulate real-world tool usage scenarios, and we also propose an instruction dataset called MTU-Instruct data to enhance the tool-use abilities of existing LLMs. Comprehensive experimental results demonstrate the effectiveness of our MTU-Bench. Code and data will be released at https://github.com/MTU-Bench-Team/MTU-Bench.git. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11710v1-abstract-full').style.display = 'none'; document.getElementById('2410.11710v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.11180">arXiv:2410.11180</a> <span> [<a href="https://arxiv.org/pdf/2410.11180">pdf</a>, <a href="https://arxiv.org/format/2410.11180">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Reinforcement Learning Based Bidding Framework with High-dimensional Bids in Power Markets </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jinyu Liu</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+H">Hongye Guo</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yun Li</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Q">Qinghu Tang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+F">Fuquan Huang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+T">Tunan Chen</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+H">Haiwang Zhong</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Q">Qixin Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.11180v1-abstract-short" style="display: inline;"> Over the past decade, bidding in power markets has attracted widespread attention. Reinforcement Learning (RL) has been widely used for power market bidding as a powerful AI tool to make decisions under real-world uncertainties. However, current RL methods mostly employ low dimensional bids, which significantly diverge from the N price-power pairs commonly used in the current power markets. The N-… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11180v1-abstract-full').style.display = 'inline'; document.getElementById('2410.11180v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.11180v1-abstract-full" style="display: none;"> Over the past decade, bidding in power markets has attracted widespread attention. Reinforcement Learning (RL) has been widely used for power market bidding as a powerful AI tool to make decisions under real-world uncertainties. 
However, current RL methods mostly employ low dimensional bids, which significantly diverge from the N price-power pairs commonly used in the current power markets. The N-pair bidding format is denoted as High Dimensional Bids (HDBs), which has not been fully integrated into the existing RL-based bidding methods. The loss of flexibility in current RL bidding methods could greatly limit the bidding profits and make it difficult to tackle the rising uncertainties brought by renewable energy generation. In this paper, we propose a framework to fully utilize HDBs for RL-based bidding methods. First, we employ a special type of neural network called Neural Network Supply Functions (NNSFs) to generate HDBs in the form of N price-power pairs. Second, we embed the NNSF into a Markov Decision Process (MDP) to make it compatible with most existing RL methods. Finally, experiments on Energy Storage Systems (ESSs) in the PJM Real-Time (RT) power market show that the proposed bidding method with HDBs can significantly improve bidding flexibility, thereby improving the profit of the state-of-the-art RL bidding methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11180v1-abstract-full').style.display = 'none'; document.getElementById('2410.11180v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.10527">arXiv:2410.10527</a> <span> [<a href="https://arxiv.org/pdf/2410.10527">pdf</a>, <a href="https://arxiv.org/format/2410.10527">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Motion-guided small MAV detection in complex and non-planar scenes </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guo%2C+H">Hanqing Guo</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+C">Canlun Zheng</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+S">Shiyu Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.10527v1-abstract-short" style="display: inline;"> In recent years, there has been a growing interest in the visual detection of micro aerial vehicles (MAVs) due to its importance in numerous applications. However, the existing methods based on either appearance or motion features encounter difficulties when the background is complex or the MAV is too small. 
In this paper, we propose a novel motion-guided MAV detector that can accurately identify… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10527v1-abstract-full').style.display = 'inline'; document.getElementById('2410.10527v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.10527v1-abstract-full" style="display: none;"> In recent years, there has been a growing interest in the visual detection of micro aerial vehicles (MAVs) due to its importance in numerous applications. However, the existing methods based on either appearance or motion features encounter difficulties when the background is complex or the MAV is too small. In this paper, we propose a novel motion-guided MAV detector that can accurately identify small MAVs in complex and non-planar scenes. This detector first exploits a motion feature enhancement module to capture the motion features of small MAVs. Then it uses multi-object tracking and trajectory filtering to eliminate false positives caused by motion parallax. Finally, an appearance-based classifier and an appearance-based detector that operates on the cropped regions are used to achieve precise detection results. Our proposed method can effectively and efficiently detect extremely small MAVs from dynamic and complex backgrounds because it aggregates pixel-level motion features and eliminates false positives based on the motion and appearance features of MAVs. Experiments on the ARD-MAV dataset demonstrate that the proposed method could achieve high performance in small MAV detection under challenging conditions and outperform other state-of-the-art methods across various metrics <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10527v1-abstract-full').style.display = 'none'; document.getElementById('2410.10527v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 6 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Pattern Recognition Letters 2024 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.10267">arXiv:2410.10267</a> <span> [<a href="https://arxiv.org/pdf/2410.10267">pdf</a>, <a href="https://arxiv.org/format/2410.10267">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> big.LITTLE Vision Transformer for Efficient Visual Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guo%2C+H">He Guo</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yulong Wang</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+Z">Zixuan Ye</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+J">Jifeng Dai</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+Y">Yuwen Xiong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.10267v1-abstract-short" style="display: inline;"> In this paper, we introduce the big.LITTLE Vision Transformer, an innovative architecture aimed at achieving efficient visual recognition. This dual-transformer system is composed of two distinct blocks: the big performance block, characterized by its high capacity and substantial computational demands, and the LITTLE efficiency block, designed for speed with lower capacity. The key innovation of… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10267v1-abstract-full').style.display = 'inline'; document.getElementById('2410.10267v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.10267v1-abstract-full" style="display: none;"> In this paper, we introduce the big.LITTLE Vision Transformer, an innovative architecture aimed at achieving efficient visual recognition. This dual-transformer system is composed of two distinct blocks: the big performance block, characterized by its high capacity and substantial computational demands, and the LITTLE efficiency block, designed for speed with lower capacity. The key innovation of our approach lies in its dynamic inference mechanism. When processing an image, our system determines the importance of each token and allocates them accordingly: essential tokens are processed by the high-performance big model, while less critical tokens are handled by the more efficient little model. This selective processing significantly reduces computational load without sacrificing the overall performance of the model, as it ensures that detailed analysis is reserved for the most important information. To validate the effectiveness of our big.LITTLE Vision Transformer, we conducted comprehensive experiments on image classification and segment anything task. Our results demonstrate that the big.LITTLE architecture not only maintains high accuracy but also achieves substantial computational savings. 
Specifically, our approach enables the efficient handling of large-scale visual recognition tasks by dynamically balancing the trade-offs between performance and efficiency. The success of our method underscores the potential of hybrid models in optimizing both computation and performance in visual recognition tasks, paving the way for more practical and scalable deployment of advanced neural networks in real-world applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10267v1-abstract-full').style.display = 'none'; document.getElementById('2410.10267v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.09886">arXiv:2410.09886</a> <span> [<a href="https://arxiv.org/pdf/2410.09886">pdf</a>, <a href="https://arxiv.org/format/2410.09886">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Block-to-Scene Pre-training for Point Cloud Hybrid-Domain Masked Autoencoders </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zha%2C+Y">Yaohua Zha</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+T">Tao Dai</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yanzi Wang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+H">Hang Guo</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+T">Taolin Zhang</a>, <a href="/search/cs?searchtype=author&query=Ouyang%2C+Z">Zhihao Ouyang</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+C">Chunlin Fan</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+B">Bin Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+K">Ke Chen</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+S">Shu-Tao Xia</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.09886v1-abstract-short" style="display: inline;"> Point clouds, as a primary representation of 3D data, can be categorized into scene domain point clouds and object domain point clouds based on the modeled content. Masked autoencoders (MAE) have become the mainstream paradigm in point clouds self-supervised learning. However, existing MAE-based methods are domain-specific, limiting the model's generalization. In this paper, we propose to pre-trai… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09886v1-abstract-full').style.display = 'inline'; document.getElementById('2410.09886v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.09886v1-abstract-full" style="display: none;"> Point clouds, as a primary representation of 3D data, can be categorized into scene domain point clouds and object domain point clouds based on the modeled content. Masked autoencoders (MAE) have become the mainstream paradigm in point clouds self-supervised learning. 
However, existing MAE-based methods are domain-specific, limiting the model's generalization. In this paper, we propose to pre-train a general Point cloud Hybrid-Domain Masked AutoEncoder (PointHDMAE) via a block-to-scene pre-training strategy. We first propose a hybrid-domain masked autoencoder consisting of an encoder and decoder belonging to the scene domain and object domain, respectively. The object domain encoder specializes in handling object point clouds and multiple shared object encoders assist the scene domain encoder in analyzing the scene point clouds. Furthermore, we propose a block-to-scene strategy to pre-train our hybrid-domain model. Specifically, we first randomly select point blocks within a scene and apply a set of transformations to convert each point block coordinates from the scene space to the object space. Then, we employ an object-level mask and reconstruction pipeline to recover the masked points of each block, enabling the object encoder to learn a universal object representation. Finally, we introduce a scene-level block position regression pipeline, which utilizes the blocks' features in the object space to regress these blocks' initial positions within the scene space, facilitating the learning of scene representations. Extensive experiments across different datasets and tasks demonstrate the generalization and superiority of our hybrid-domain model. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09886v1-abstract-full').style.display = 'none'; document.getElementById('2410.09886v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.06555">arXiv:2410.06555</a> <span> [<a href="https://arxiv.org/pdf/2410.06555">pdf</a>, <a href="https://arxiv.org/format/2410.06555">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> ING-VP: MLLMs cannot Play Easy Vision-based Games Yet </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Haoran Zhang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+H">Hangyu Guo</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+S">Shuyue Guo</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+M">Meng Cao</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+W">Wenhao Huang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jiaheng Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+G">Ge Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.06555v1-abstract-short" style="display: inline;"> As multimodal large language models (MLLMs) continue to demonstrate increasingly competitive performance across a broad spectrum of tasks, more intricate and comprehensive benchmarks have been developed to assess these cutting-edge models. These benchmarks introduce new challenges to core capabilities such as perception, reasoning, and planning. 
However, existing multimodal benchmarks fall short i… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06555v1-abstract-full').style.display = 'inline'; document.getElementById('2410.06555v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.06555v1-abstract-full" style="display: none;"> As multimodal large language models (MLLMs) continue to demonstrate increasingly competitive performance across a broad spectrum of tasks, more intricate and comprehensive benchmarks have been developed to assess these cutting-edge models. These benchmarks introduce new challenges to core capabilities such as perception, reasoning, and planning. However, existing multimodal benchmarks fall short in providing a focused evaluation of multi-step planning based on spatial relationships in images. To bridge this gap, we present ING-VP, the first INteractive Game-based Vision Planning benchmark, specifically designed to evaluate the spatial imagination and multi-step reasoning abilities of MLLMs. ING-VP features 6 distinct games, encompassing 300 levels, each with 6 unique configurations. A single model engages in over 60,000 rounds of interaction. The benchmark framework allows for multiple comparison settings, including image-text vs. text-only inputs, single-step vs. multi-step reasoning, and with-history vs. without-history conditions, offering valuable insights into the model's capabilities. We evaluated numerous state-of-the-art MLLMs, with the highest-performing model, Claude-3.5 Sonnet, achieving an average accuracy of only 3.37%, far below the anticipated standard. This work aims to provide a specialized evaluation framework to drive advancements in MLLMs' capacity for complex spatial reasoning and planning. The code is publicly available at https://github.com/Thisisus7/ING-VP.git. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06555v1-abstract-full').style.display = 'none'; document.getElementById('2410.06555v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">49 pages, 12 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.05849">arXiv:2410.05849</a> <span> [<a href="https://arxiv.org/pdf/2410.05849">pdf</a>, <a href="https://arxiv.org/format/2410.05849">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ModalPrompt:Dual-Modality Guided Prompt for Continual Learning of Large Multimodal Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zeng%2C+F">Fanhu Zeng</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+F">Fei Zhu</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+H">Haiyang Guo</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xu-Yao Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+C">Cheng-Lin Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.05849v1-abstract-short" style="display: inline;"> Large Multimodal Models (LMMs) exhibit remarkable multi-tasking ability by learning mixed datasets jointly. However, novel tasks would be encountered sequentially in dynamic world, and continually fine-tuning LMMs often leads to performance degrades. To handle the challenges of catastrophic forgetting, existing methods leverage data replay or model expansion, both of which are not specially develo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05849v1-abstract-full').style.display = 'inline'; document.getElementById('2410.05849v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.05849v1-abstract-full" style="display: none;"> Large Multimodal Models (LMMs) exhibit remarkable multi-tasking ability by learning mixed datasets jointly. However, novel tasks would be encountered sequentially in dynamic world, and continually fine-tuning LMMs often leads to performance degrades. To handle the challenges of catastrophic forgetting, existing methods leverage data replay or model expansion, both of which are not specially developed for LMMs and have their inherent limitations. In this paper, we propose a novel dual-modality guided prompt learning framework (ModalPrompt) tailored for multimodal continual learning to effectively learn new tasks while alleviating forgetting of previous knowledge. Concretely, we learn prototype prompts for each task and exploit efficient prompt selection for task identifiers and prompt fusion for knowledge transfer based on image-text supervision. Extensive experiments demonstrate the superiority of our approach, e.g., ModalPrompt achieves +20% performance gain on LMMs continual learning benchmarks with $\times$ 1.42 inference speed refraining from growing training cost in proportion to the number of tasks. The code will be made publically available. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05849v1-abstract-full').style.display = 'none'; document.getElementById('2410.05849v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.05601">arXiv:2410.05601</a> <span> [<a href="https://arxiv.org/pdf/2410.05601">pdf</a>, <a href="https://arxiv.org/format/2410.05601">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ReFIR: Grounding Large Restoration Models with Retrieval Augmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guo%2C+H">Hang Guo</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+T">Tao Dai</a>, <a href="/search/cs?searchtype=author&query=Ouyang%2C+Z">Zhihao Ouyang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+T">Taolin Zhang</a>, <a href="/search/cs?searchtype=author&query=Zha%2C+Y">Yaohua Zha</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+B">Bin Chen</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+S">Shu-tao Xia</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.05601v1-abstract-short" style="display: inline;"> Recent advances in diffusion-based Large Restoration Models (LRMs) have significantly improved photo-realistic image restoration by leveraging the internal knowledge embedded within model weights. However, existing LRMs often suffer from the hallucination dilemma, i.e., producing incorrect contents or textures when dealing with severe degradations, due to their heavy reliance on limited internal k… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05601v1-abstract-full').style.display = 'inline'; document.getElementById('2410.05601v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.05601v1-abstract-full" style="display: none;"> Recent advances in diffusion-based Large Restoration Models (LRMs) have significantly improved photo-realistic image restoration by leveraging the internal knowledge embedded within model weights. However, existing LRMs often suffer from the hallucination dilemma, i.e., producing incorrect contents or textures when dealing with severe degradations, due to their heavy reliance on limited internal knowledge. In this paper, we propose an orthogonal solution called the Retrieval-augmented Framework for Image Restoration (ReFIR), which incorporates retrieved images as external knowledge to extend the knowledge boundary of existing LRMs in generating details faithful to the original scene. Specifically, we first introduce the nearest neighbor lookup to retrieve content-relevant high-quality images as reference, after which we propose the cross-image injection to modify existing LRMs to utilize high-quality textures from retrieved images. 
Thanks to the additional external knowledge, our ReFIR can effectively handle the hallucination challenge and produce faithful results. Extensive experiments demonstrate that ReFIR can achieve not only high-fidelity but also realistic restoration results. Importantly, our ReFIR requires no training and is adaptable to various LRMs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05601v1-abstract-full').style.display = 'none'; document.getElementById('2410.05601v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.02201">arXiv:2410.02201</a> <span> [<a href="https://arxiv.org/pdf/2410.02201">pdf</a>, <a href="https://arxiv.org/format/2410.02201">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Remember and Recall: Associative-Memory-based Trajectory Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guo%2C+H">Hang Guo</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yuzhen Zhang</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+T">Tianci Gao</a>, <a href="/search/cs?searchtype=author&query=Su%2C+J">Junning Su</a>, <a href="/search/cs?searchtype=author&query=Lv%2C+P">Pei Lv</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+M">Mingliang Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.02201v1-abstract-short" style="display: inline;"> Trajectory prediction is a pivotal component of autonomous driving systems, enabling the application of accumulated movement experience to current scenarios. Although most existing methods concentrate on learning continuous representations to gain valuable experience, they often suffer from computational inefficiencies and struggle with unfamiliar situations. 
To address this issue, we propose the Fragmented-Memory-based Trajectory Prediction (FMTP) model, inspired by the remarkable learning capabilities of humans, particularly their ability to leverage accumulated experience and recall relevant memories in unfamiliar situations. The FMTP model employs discrete representations to enhance computational efficiency by reducing information redundancy while maintaining the flexibility to utilize past experiences. Specifically, we design a learnable memory array by consolidating continuous trajectory representations from the training set using defined quantization operations during the training phase. This approach further eliminates redundant information while preserving essential features in discrete form. Additionally, we develop an advanced reasoning engine based on language models to deeply learn the associative rules among these discrete representations. Our method has been evaluated on various public datasets, including ETH-UCY, inD, SDD, nuScenes, Waymo, and VTL-TP. The extensive experimental results demonstrate that our approach achieves significant performance and extracts more valuable experience from past trajectories to inform the current state. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.02201v1-abstract-full').style.display = 'none'; document.getElementById('2410.02201v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.20527">arXiv:2409.20527</a> <span> [<a href="https://arxiv.org/pdf/2409.20527">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Bi-directional Momentum-based Haptic Feedback and Control System for Dexterous Telemanipulation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+H">Haoyang Wang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+H">Haoran Guo</a>, <a href="/search/cs?searchtype=author&query=Ba%2C+H">He Ba</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhengxiong Li</a>, <a href="/search/cs?searchtype=author&query=Tao%2C+L">Lingfeng Tao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.20527v1-abstract-short" style="display: inline;"> Haptic feedback is essential for dexterous telemanipulation that enables operators to control robotic hands remotely with high skill and precision, mimicking a human hand's natural movement and sensation. However, current haptic methods for dexterous telemanipulation cannot support torque feedback, resulting in object rotation and rolling mismatches. 

arXiv:2409.20527 (https://arxiv.org/abs/2409.20527) [pdf] cs.RO
Bi-directional Momentum-based Haptic Feedback and Control System for Dexterous Telemanipulation
Authors: Haoyang Wang, Haoran Guo, He Ba, Zhengxiong Li, Lingfeng Tao
Abstract: Haptic feedback is essential for dexterous telemanipulation that enables operators to control robotic hands remotely with high skill and precision, mimicking a human hand's natural movement and sensation. However, current haptic methods for dexterous telemanipulation cannot support torque feedback, resulting in object rotation and rolling mismatches. The operator must make tedious adjustments in these tasks, leading to delays, reduced situational awareness, and suboptimal task performance. This work presents a Bi-directional Momentum-based Haptic Feedback and Control (Bi-Hap) system for real-time dexterous telemanipulation. Bi-Hap integrates multi-modal sensors to extract the human's interaction information with the object and share it with the robot's learning-based controller. A Field-Oriented Control (FOC) algorithm is developed to enable the integrated brushless active momentum wheel to generate precise torque and vibration feedback, bridging the gap between human intent and robotic actions. Different feedback strategies are designed for varying error states to align with the operator's intuition. Extensive experiments with human subjects using a virtual Shadow Dexterous Hand demonstrate the effectiveness of Bi-Hap in enhancing task performance and user confidence. Bi-Hap achieved real-time feedback capability with low command-following latency (delay < 0.025 s) and highly accurate torque feedback (RMSE < 0.010 Nm).
Submitted 30 September, 2024; originally announced September 2024.
Comments: This work has been submitted to the IEEE for possible publication

arXiv:2409.20473 (https://arxiv.org/abs/2409.20473) [pdf] cs.RO
Impact of Tactile Sensor Quantities and Placements on Learning-based Dexterous Manipulation
Authors: Haoran Guo, Haoyang Wang, Zhengxiong Li, He Bai, Lingfeng Tao
Abstract: Tactile information effectively enables faster training and better task performance for learning-based in-hand manipulation. Existing approaches are validated in simulated environments with a large number of tactile sensors. However, attaching such sensors to a real robot hand is often impractical due to high cost and physical limitations. To enable real-world adoption of tactile sensors, this study investigates the impact of tactile sensors, including their varying quantities and placements on robot hands, on dexterous manipulation task performance, and analyzes the importance of each. By empirically decreasing the sensor quantity, we find an optimized tactile sensor configuration (21 sensors) that keeps over 93% of task performance with only 20% of the sensor quantity of the original set (92 sensors) for the block manipulation task, leading to a potential reduction of over 80% in sensor manufacturing and design costs. To transform the empirical results into a generalizable understanding, we build a task performance prediction model with a weighted linear regression algorithm and use it to forecast task performance under different sensor configurations. To show its generalizability, we verify this model on egg and pen manipulation tasks and achieve an average prediction error of 3.12%.
Submitted 30 September, 2024; originally announced September 2024.
Comments: This work has been submitted to the IEEE for possible publication
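
The task-performance prediction model mentioned above is a weighted linear regression over sensor configurations. A minimal sketch in Python follows; encoding a configuration as a 0/1 placement vector and the specific weights are assumptions for illustration, not the paper's exact feature design.

    import numpy as np

    def fit_weighted_linear(X, y, w):
        """Weighted least squares: minimize sum_i w_i * (y_i - x_i . beta)^2.
        X: (N, D) sensor-configuration features, e.g. a 0/1 placement vector per trial.
        y: (N,)  measured task performance.  w: (N,) per-sample weights."""
        W = np.diag(w)
        return np.linalg.solve(X.T @ W @ X, X.T @ W @ y)

    def predict(X, beta):
        return X @ beta

    # toy usage: 6 candidate sensor sites, 20 measured configurations
    rng = np.random.default_rng(1)
    X = rng.integers(0, 2, size=(20, 6)).astype(float)
    y = X @ np.array([0.20, 0.05, 0.30, 0.10, 0.25, 0.10]) + rng.normal(0, 0.02, 20)
    beta = fit_weighted_linear(X, y, w=np.ones(20))
    print(predict(X[:3], beta))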
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.20473v1-abstract-full').style.display = 'none'; document.getElementById('2409.20473v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This work has been submitted to the IEEE for possible publication</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.20424">arXiv:2409.20424</a> <span> [<a href="https://arxiv.org/pdf/2409.20424">pdf</a>, <a href="https://arxiv.org/format/2409.20424">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> World to Code: Multi-modal Data Generation via Self-Instructed Compositional Captioning and Filtering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jiacong Wang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+B">Bohong Wu</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+H">Haiyong Jiang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+X">Xun Zhou</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+X">Xin Xiao</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+H">Haoyuan Guo</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+J">Jun Xiao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.20424v1-abstract-short" style="display: inline;"> Recent advances in Vision-Language Models (VLMs) and the scarcity of high-quality multi-modal alignment data have inspired numerous researches on synthetic VLM data generation. The conventional norm in VLM data construction uses a mixture of specialists in caption and OCR, or stronger VLM APIs and expensive human annotation. In this paper, we present World to Code (W2C), a meticulously curated mul… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.20424v1-abstract-full').style.display = 'inline'; document.getElementById('2409.20424v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.20424v1-abstract-full" style="display: none;"> Recent advances in Vision-Language Models (VLMs) and the scarcity of high-quality multi-modal alignment data have inspired numerous researches on synthetic VLM data generation. The conventional norm in VLM data construction uses a mixture of specialists in caption and OCR, or stronger VLM APIs and expensive human annotation. In this paper, we present World to Code (W2C), a meticulously curated multi-modal data construction pipeline that organizes the final generation output into a Python code format. 
The pipeline leverages the VLM itself to extract cross-modal information via different prompts and filters the generated outputs via a consistency filtering strategy. Experiments demonstrate the high quality of W2C, which improves various existing visual question answering and visual grounding benchmarks across different VLMs. Further analysis also shows that the new code-parsing ability of VLMs offers better cross-modal equivalence than the commonly used detailed captioning ability. Our code is available at https://github.com/foundation-multimodal-models/World2Code.
Submitted 30 September, 2024; originally announced September 2024.
Comments: Accepted at EMNLP 2024 Main Conference, 16 pages

arXiv:2409.18980 (https://arxiv.org/abs/2409.18980) [pdf, other] cs.CL cs.AI cs.CV
IW-Bench: Evaluating Large Multimodal Models for Converting Image-to-Web
Authors: Hongcheng Guo, Wei Zhang, Junhao Chen, Yaonan Gu, Jian Yang, Junjia Du, Binyuan Hui, Tianyu Liu, Jianxin Ma, Chang Zhou, Zhoujun Li
Abstract: Recent advancements in large multimodal models have led to significant strides in image comprehension capabilities. Despite these advancements, there is a lack of a robust benchmark specifically for assessing the Image-to-Web conversion proficiency of these large models. Primarily, it is essential to ensure the integrity of the generated web elements. These elements comprise visible and invisible categories. Previous evaluation methods (e.g., BLEU) are notably susceptible to significant alterations due to the presence of invisible elements in web pages. Furthermore, it is crucial to measure the layout information of web pages, referring to the positional relationships between elements, which is overlooked by previous work. To address these challenges, we have curated and aligned a benchmark of images and corresponding web code (IW-Bench). Specifically, we propose Element Accuracy, which tests the completeness of the elements by parsing the Document Object Model (DOM) tree. Layout Accuracy is also proposed to analyze the positional relationships of elements by converting the DOM tree into a common subsequence. Besides, we design a five-hop multimodal Chain-of-Thought prompting method for better performance, which contains five hops: 1) SoM prompt injection; 2) inferring elements; 3) inferring layout; 4) inferring web code; 5) reflection. Our benchmark comprises 1200 pairs of images and web code with varying levels of difficulty. We have conducted extensive experiments on existing large multimodal models, offering insights into their performance and areas for improvement in the image-to-web domain.
Submitted 14 September, 2024; originally announced September 2024.
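
To make the two metrics concrete, here is a small illustrative sketch in Python: it collects tag sequences with the standard html.parser (a stand-in for full DOM-tree parsing) for an element-completeness score, and uses a longest common subsequence for an order-based layout score. IW-Bench's exact matching rules are not reproduced here; this is only an approximation under those assumptions.

    from html.parser import HTMLParser

    class TagCollector(HTMLParser):
        """Collect the start-tag sequence of a document (a stand-in for DOM traversal)."""
        def __init__(self):
            super().__init__()
            self.tags = []
        def handle_starttag(self, tag, attrs):
            self.tags.append(tag)

    def tag_sequence(html):
        p = TagCollector()
        p.feed(html)
        return p.tags

    def element_accuracy(pred_html, ref_html):
        """Fraction of reference elements (by tag multiset) recovered in the prediction."""
        pred, ref = tag_sequence(pred_html), tag_sequence(ref_html)
        pool, matched = pred.copy(), 0
        for t in ref:
            if t in pool:
                pool.remove(t)
                matched += 1
        return matched / max(len(ref), 1)

    def lcs_len(a, b):
        """Longest common subsequence length, used as a crude layout-order score."""
        dp = [[0] * (len(b) + 1) for _ in range(len(a) + 1)]
        for i, x in enumerate(a, 1):
            for j, y in enumerate(b, 1):
                dp[i][j] = dp[i - 1][j - 1] + 1 if x == y else max(dp[i - 1][j], dp[i][j - 1])
        return dp[-1][-1]

    ref = "<div><h1>t</h1><p>a</p><p>b</p></div>"
    pred = "<div><p>a</p><h1>t</h1></div>"
    print(element_accuracy(pred, ref), lcs_len(tag_sequence(pred), tag_sequence(ref)))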

arXiv:2409.17346 (https://arxiv.org/abs/2409.17346) [pdf, other] cs.GR
Multi-Tier Preservation of Discrete Morse Smale Complexes in Error-Bounded Lossy Compression
Authors: Yuxiao Li, Xin Liang, Bei Wang, Hanqi Guo
Abstract: We propose a multi-tier paradigm to preserve various components of Morse-Smale complexes in lossy compressed scalar fields, including extrema, saddles, separatrices, and persistence diagrams. Existing error-bounded lossy compressors rarely consider preserving topological structures such as discrete Morse-Smale complexes, leading to significant inaccuracies in data interpretation and potentially resulting in incorrect scientific conclusions. This paper mainly focuses on preserving the Morse-Smale complexes in 2D or 3D discrete scalar fields by precisely preserving critical simplices and the separatrices that connect them. Our approach generates a series of edits during compression time, which are applied to the decompressed data to accurately reconstruct the complexes while maintaining the error within prescribed bounds. We design a workflow that iteratively fixes critical simplices and separatrices in alternating steps until convergence within finitely many iterations. Our approach addresses diverse application needs by offering users flexible options to balance compression efficiency and feature preservation. To enable effective integration with lossy compressors, we use GPU parallelism to enhance the performance of each workflow component. We conduct experiments on various datasets to demonstrate the effectiveness of our method in accurately preserving Morse-Smale complexes.
Submitted 25 September, 2024; originally announced September 2024.
Comments: 11 pages, 11 figures
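
As a toy illustration of the edit-based idea only (not the authors' GPU workflow, which handles critical simplices and separatrices in 2D/3D), the sketch below restores 1D local maxima of a decompressed signal by resetting the affected values to their originals, so the prescribed error bound is still respected; the iteration scheme and 1D setting are assumptions.

    import numpy as np

    def is_local_max(f, i):
        return f[i] > f[i - 1] and f[i] > f[i + 1]

    def fix_maxima(orig, dec, max_iters=10):
        """Reset values around each original local maximum to their originals until
        every such maximum survives; edits copy original values, so any pointwise
        error bound satisfied by `dec` still holds afterwards."""
        out = dec.copy()
        for _ in range(max_iters):
            changed = False
            for i in range(1, len(orig) - 1):
                if is_local_max(orig, i) and not is_local_max(out, i):
                    out[i - 1:i + 2] = orig[i - 1:i + 2]
                    changed = True
            if not changed:
                break
        return out

    rng = np.random.default_rng(0)
    orig = np.sin(np.linspace(0, 12, 200))
    bound = 0.05
    dec = orig + rng.uniform(-bound, bound, size=orig.shape)  # stand-in lossy compressor
    fixed = fix_maxima(orig, dec)
    assert np.max(np.abs(fixed - orig)) <= bound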
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.17346v1-abstract-full').style.display = 'none'; document.getElementById('2409.17346v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages,11 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.15272">arXiv:2409.15272</a> <span> [<a href="https://arxiv.org/pdf/2409.15272">pdf</a>, <a href="https://arxiv.org/format/2409.15272">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> OmniBench: Towards The Future of Universal Omni-Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yizhi Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+G">Ge Zhang</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+Y">Yinghao Ma</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+R">Ruibin Yuan</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+K">Kang Zhu</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+H">Hangyu Guo</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+Y">Yiming Liang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jiaheng Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zekun Wang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jian Yang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+S">Siwei Wu</a>, <a href="/search/cs?searchtype=author&query=Qu%2C+X">Xingwei Qu</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+J">Jinjie Shi</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xinyue Zhang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zhenzhu Yang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiangzhou Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhaoxiang Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zachary Liu</a>, <a href="/search/cs?searchtype=author&query=Benetos%2C+E">Emmanouil Benetos</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+W">Wenhao Huang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+C">Chenghua Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.15272v3-abstract-short" style="display: inline;"> Recent advancements in multimodal large language models (MLLMs) have aimed to integrate and interpret data across diverse modalities. However, the capacity of these models to concurrently process and reason about multiple modalities remains inadequately explored, partly due to the lack of comprehensive modality-wise benchmarks. 
Abstract: Recent advancements in multimodal large language models (MLLMs) have aimed to integrate and interpret data across diverse modalities. However, the capacity of these models to concurrently process and reason about multiple modalities remains inadequately explored, partly due to the lack of comprehensive modality-wise benchmarks. We introduce OmniBench, a novel benchmark designed to rigorously evaluate models' ability to recognize, interpret, and reason across visual, acoustic, and textual inputs simultaneously. We define models capable of such tri-modal processing as omni-language models (OLMs). OmniBench is distinguished by high-quality human annotations, ensuring that accurate responses require integrated understanding and reasoning across all three modalities. Our main findings reveal that: i) most OLMs exhibit critical limitations in instruction-following and reasoning capabilities within tri-modal contexts; and ii) most baseline models perform poorly (below 50% accuracy) even when provided with alternative textual representations of images and/or audio. These results suggest that the ability to construct a consistent context from text, image, and audio is often overlooked in existing MLLM training paradigms. To address this gap, we curate an instruction tuning dataset of 84.5K training samples, OmniInstruct, for training OLMs to adapt to multimodal contexts. We advocate for future research to focus on developing more robust tri-modal integration techniques and training strategies to enhance OLM performance across diverse modalities. The code and live leaderboard can be found at https://m-a-p.ai/OmniBench.
Submitted 3 October, 2024; v1 submitted 23 September, 2024; originally announced September 2024.

arXiv:2409.11630 (https://arxiv.org/abs/2409.11630) [pdf, other] cs.SD eess.AS
Speaking from Coarse to Fine: Improving Neural Codec Language Model via Multi-Scale Speech Coding and Generation
Authors: Haohan Guo, Fenglong Xie, Dongchao Yang, Xixin Wu, Helen Meng
Abstract: The neural codec language model (CLM) has demonstrated remarkable performance in text-to-speech (TTS) synthesis. However, troubled by "recency bias", the CLM lacks sufficient attention to coarse-grained information at a higher temporal scale, often producing unnatural or even unintelligible speech. This work proposes CoFi-Speech, a coarse-to-fine CLM-TTS approach, employing multi-scale speech coding and generation to address this issue. We train a multi-scale neural codec, CoFi-Codec, to encode speech into a multi-scale discrete representation comprising multiple token sequences with different time resolutions. Then, we propose CoFi-LM, which can generate this representation in two modes: single-LM-based chain-of-scale generation and multiple-LM-based stack-of-scale generation. In experiments, CoFi-Speech significantly outperforms single-scale baseline systems on naturalness and speaker similarity in zero-shot TTS. The analysis of multi-scale coding demonstrates the effectiveness of CoFi-Codec in learning multi-scale discrete speech representations while maintaining high-quality speech reconstruction. The coarse-to-fine multi-scale generation, especially the stack-of-scale approach, is also validated as crucial for building a high-quality neural codec language model for TTS.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.11630v1-abstract-full').style.display = 'none'; document.getElementById('2409.11630v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.10584">arXiv:2409.10584</a> <span> [<a href="https://arxiv.org/pdf/2409.10584">pdf</a>, <a href="https://arxiv.org/format/2409.10584">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Quantitative Methods">q-bio.QM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Biomolecules">q-bio.BM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Manifold-Constrained Nucleus-Level Denoising Diffusion Model for Structure-Based Drug Design </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+S">Shengchao Liu</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+D">Divin Yan</a>, <a href="/search/cs?searchtype=author&query=Du%2C+W">Weitao Du</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+W">Weiyang Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhuoxinran Li</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+H">Hongyu Guo</a>, <a href="/search/cs?searchtype=author&query=Borgs%2C+C">Christian Borgs</a>, <a href="/search/cs?searchtype=author&query=Chayes%2C+J">Jennifer Chayes</a>, <a href="/search/cs?searchtype=author&query=Anandkumar%2C+A">Anima Anandkumar</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.10584v2-abstract-short" style="display: inline;"> Artificial intelligence models have shown great potential in structure-based drug design, generating ligands with high binding affinities. However, existing models have often overlooked a crucial physical constraint: atoms must maintain a minimum pairwise distance to avoid separation violation, a phenomenon governed by the balance of attractive and repulsive forces. To mitigate such separation vio… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.10584v2-abstract-full').style.display = 'inline'; document.getElementById('2409.10584v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.10584v2-abstract-full" style="display: none;"> Artificial intelligence models have shown great potential in structure-based drug design, generating ligands with high binding affinities. However, existing models have often overlooked a crucial physical constraint: atoms must maintain a minimum pairwise distance to avoid separation violation, a phenomenon governed by the balance of attractive and repulsive forces. 
To mitigate such separation violations, we propose NucleusDiff. It models the interactions between atomic nuclei and their surrounding electron clouds by enforcing a distance constraint between the nuclei and manifolds. We quantitatively evaluate NucleusDiff using the CrossDocked2020 dataset and a COVID-19 therapeutic target, demonstrating that NucleusDiff reduces the violation rate by up to 100.00% and enhances binding affinity by up to 22.16%, surpassing state-of-the-art models for structure-based drug design. We also provide qualitative analysis through manifold sampling, visually confirming the effectiveness of NucleusDiff in reducing separation violations and improving binding affinities.
Submitted 30 September, 2024; v1 submitted 16 September, 2024; originally announced September 2024.
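
The minimum pairwise-distance idea can be illustrated with a generic post-hoc projection step in Python; this is not NucleusDiff's manifold-based constraint applied during diffusion, and the 1.2 threshold, step size, and iteration count are arbitrary illustrative values.

    import numpy as np

    def enforce_min_distance(coords, d_min=1.2, iters=50, step=0.5):
        """Iteratively push apart any atom pair closer than d_min.
        coords: (N, 3) generated atom positions."""
        x = coords.copy()
        for _ in range(iters):
            diff = x[:, None, :] - x[None, :, :]
            dist = np.linalg.norm(diff, axis=-1)
            np.fill_diagonal(dist, np.inf)
            i, j = np.where(dist < d_min)
            if len(i) == 0:
                break
            for a, b in zip(i, j):
                if a < b:  # handle each unordered pair once
                    direction = (x[a] - x[b]) / (dist[a, b] + 1e-8)
                    push = step * (d_min - dist[a, b]) / 2
                    x[a] += push * direction
                    x[b] -= push * direction
        return x

    rng = np.random.default_rng(3)
    atoms = rng.normal(scale=0.5, size=(8, 3))   # toy generated ligand coordinates
    fixed = enforce_min_distance(atoms)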

arXiv:2409.10072 (https://arxiv.org/abs/2409.10072) [pdf, other] cs.SD eess.AS
Speaker Contrastive Learning for Source Speaker Tracing
Authors: Qing Wang, Hongmei Guo, Jian Kang, Mengjie Du, Jie Li, Xiao-Lei Zhang, Lei Xie
Abstract: As a form of biometric authentication technology, the security of speaker verification (SV) systems is of utmost importance. However, SV systems are inherently vulnerable to various types of attacks that can compromise their accuracy and reliability. One such attack is voice conversion, which modifies a person's speech to sound like another person by altering various vocal characteristics. This poses a significant threat to SV systems. To address this challenge, the Source Speaker Tracing Challenge (SSTC) in IEEE SLT 2024 aims to identify the source speaker information in manipulated speech signals. Specifically, SSTC focuses on source speaker verification against voice conversion to determine whether two converted speech samples originate from the same source speaker. In this study, we propose a speaker contrastive learning-based approach for source speaker tracing to learn the latent source speaker information in converted speech. To learn a more source-speaker-related representation, we employ a speaker contrastive loss during the training of the embedding extractor. This speaker contrastive loss helps identify the true source speaker embedding among several distractor speaker embeddings, enabling the embedding extractor to learn the source speaker information potentially present in the converted speech. Experiments demonstrate that our proposed speaker contrastive learning system achieves the lowest EER of 16.788% on the challenge test set, securing first place in the challenge.
Submitted 16 September, 2024; originally announced September 2024.
Comments: 7 pages, 2 figures, accepted by SLT
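
A speaker contrastive loss of the kind described (pick out the true source-speaker embedding among distractors) is commonly implemented as an InfoNCE-style cross-entropy over cosine similarities. The PyTorch sketch below follows that generic form; the embedding dimension, temperature, and batch layout are assumptions, not the paper's configuration.

    import torch
    import torch.nn.functional as F

    def speaker_contrastive_loss(converted_emb, source_emb, distractor_embs, temperature=0.07):
        """converted_emb: (B, D) embeddings from converted speech
        source_emb:      (B, D) true source-speaker embeddings (positives)
        distractor_embs: (B, K, D) distractor speaker embeddings (negatives)"""
        q = F.normalize(converted_emb, dim=-1)                 # (B, D)
        pos = F.normalize(source_emb, dim=-1).unsqueeze(1)     # (B, 1, D)
        neg = F.normalize(distractor_embs, dim=-1)             # (B, K, D)
        candidates = torch.cat([pos, neg], dim=1)              # (B, 1 + K, D)
        logits = torch.einsum("bd,bkd->bk", q, candidates) / temperature
        target = torch.zeros(q.size(0), dtype=torch.long)      # the positive sits at index 0
        return F.cross_entropy(logits, target)

    # toy usage: batch of 4, 192-dim embeddings, 5 distractor speakers
    loss = speaker_contrastive_loss(torch.randn(4, 192), torch.randn(4, 192), torch.randn(4, 5, 192))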

arXiv:2409.09653 (https://arxiv.org/abs/2409.09653) [pdf, other] cs.LG cs.AI
KAN v.s. MLP for Offline Reinforcement Learning
Authors: Haihong Guo, Fengxin Li, Jiao Li, Hongyan Liu
Abstract: The Kolmogorov-Arnold Network (KAN) is an emerging neural network architecture in machine learning. It has greatly interested the research community as to whether KAN can be a promising alternative to the commonly used Multi-Layer Perceptron (MLP). Experiments in various fields have demonstrated that KAN-based machine learning can achieve comparable, if not better, performance than MLP-based methods, but with much smaller parameter scales and better explainability. In this paper, we explore the incorporation of KAN into the actor and critic networks for offline reinforcement learning (RL). We evaluate the performance, parameter scales, and training efficiency of various KAN- and MLP-based conservative Q-learning (CQL) models on the classical D4RL benchmark for offline RL. Our study demonstrates that KAN can achieve performance close to the commonly used MLP with significantly fewer parameters. This gives us the option to choose the base networks according to the requirements of the offline RL tasks.
Submitted 15 September, 2024; originally announced September 2024.
Comments: 5 pages, 2 figures
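
Swapping the base network of a CQL actor or critic can be made concrete with a small factory sketch in PyTorch. The SimpleKANLayer below uses Gaussian basis functions rather than the B-spline parameterization of the original KAN, and the network shapes are assumptions; it only illustrates the "choose KAN or MLP as the backbone" design choice.

    import torch
    import torch.nn as nn

    class SimpleKANLayer(nn.Module):
        """Simplified KAN-style layer: each edge applies a learnable univariate function,
        here a linear combination of fixed Gaussian basis functions."""
        def __init__(self, in_dim, out_dim, n_basis=8):
            super().__init__()
            self.centers = nn.Parameter(torch.linspace(-2, 2, n_basis), requires_grad=False)
            self.coef = nn.Parameter(torch.randn(out_dim, in_dim, n_basis) * 0.1)

        def forward(self, x):                                             # x: (B, in_dim)
            phi = torch.exp(-((x.unsqueeze(-1) - self.centers) ** 2))     # (B, in_dim, n_basis)
            return torch.einsum("bik,oik->bo", phi, self.coef)            # (B, out_dim)

    def make_network(kind, in_dim, hidden, out_dim):
        """Base network for the CQL actor/critic: kind is 'mlp' or 'kan'."""
        layer = (lambda i, o: nn.Sequential(nn.Linear(i, o), nn.ReLU())) if kind == "mlp" \
            else (lambda i, o: SimpleKANLayer(i, o))
        return nn.Sequential(layer(in_dim, hidden), layer(hidden, hidden), nn.Linear(hidden, out_dim))

    critic = make_network("kan", in_dim=17, hidden=64, out_dim=1)  # a Q(s, a) head on concatenated inputs
    print(sum(p.numel() for p in critic.parameters() if p.requires_grad))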

arXiv:2409.07957 (https://arxiv.org/abs/2409.07957) [pdf, other] physics.comp-ph astro-ph.IM cs.AI
Rapid Parameter Estimation for Extreme Mass Ratio Inspirals Using Machine Learning
Authors: Bo Liang, Hong Guo, Tianyu Zhao, He Wang, Herik Evangelinelis, Yuxiang Xu, Chang Liu, Manjia Liang, Xiaotong Wei, Yong Yuan, Peng Xu, Minghui Du, Wei-Liang Qian, Ziren Luo
Abstract: Extreme-mass-ratio inspiral (EMRI) signals pose significant challenges in gravitational wave (GW) astronomy owing to their low-frequency nature and highly complex waveforms, which occupy a high-dimensional parameter space with numerous variables. Given their extended inspiral timescales and low signal-to-noise ratios, EMRI signals warrant prolonged observation periods.
Parameter estimation becomes particularly challenging due to non-local parameter degeneracies arising from multiple local maxima, as well as flat regions and ridges inherent in the likelihood function. These factors lead to exceptionally high time complexity for parameter analysis when employing traditional matched filtering and random sampling methods. To address these challenges, the present study applies machine learning to Bayesian posterior estimation of EMRI signals, leveraging the recently developed flow matching technique based on ODE neural networks. Our approach demonstrates computational efficiency several orders of magnitude faster than traditional Markov Chain Monte Carlo (MCMC) methods, while preserving the unbiasedness of the parameter estimates. We show that machine learning has the potential to efficiently handle the vast parameter space, involving up to seventeen parameters, associated with EMRI signals. Furthermore, to our knowledge, this is the first instance of applying machine learning, specifically Continuous Normalizing Flows (CNFs), to EMRI signal analysis. Our findings highlight the promising potential of machine learning in EMRI waveform analysis, offering new perspectives for the advancement of space-based GW detection and GW astronomy.
Submitted 12 September, 2024; originally announced September 2024.
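
Flow matching for posterior estimation, as referenced in this abstract, trains a velocity field that transports noise into posterior samples conditioned on the data. The PyTorch sketch below uses the generic rectified-flow-style objective; the 17 parameter dimensions come from the abstract, while the network shape and the 64-dimensional data summary are assumptions for illustration.

    import torch
    import torch.nn as nn

    class VelocityNet(nn.Module):
        """Velocity field v(x_t, t, d) over source parameters, conditioned on a data summary d."""
        def __init__(self, theta_dim=17, data_dim=64, hidden=128):
            super().__init__()
            self.net = nn.Sequential(
                nn.Linear(theta_dim + data_dim + 1, hidden), nn.SiLU(),
                nn.Linear(hidden, hidden), nn.SiLU(),
                nn.Linear(hidden, theta_dim),
            )
        def forward(self, x_t, t, d):
            return self.net(torch.cat([x_t, t, d], dim=-1))

    def flow_matching_loss(model, theta, d):
        """Along the straight path x_t = (1 - t) * noise + t * theta,
        the target velocity is simply (theta - noise)."""
        noise = torch.randn_like(theta)
        t = torch.rand(theta.size(0), 1)
        x_t = (1 - t) * noise + t * theta
        v_target = theta - noise
        v_pred = model(x_t, t, d)
        return ((v_pred - v_target) ** 2).mean()

    model = VelocityNet()
    theta = torch.randn(32, 17)     # stand-in for EMRI source parameters
    summary = torch.randn(32, 64)   # stand-in for a learned summary of the GW data
    loss = flow_matching_loss(model, theta, summary)
    loss.backward()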