CINXE.COM

Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/> <!-- new favicon config and versions by realfavicongenerator.net --> <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b"> <!-- end favicon config --> <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\\(","\\)"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a> <!-- contains Cornell logo and sponsor statement --> <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 1,797 results for author: <span class="mathjax">Wang, R</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Wang%2C+R">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Wang, R"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Wang%2C+R&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Wang, R"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Wang%2C+R&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Wang%2C+R&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Wang%2C+R&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Wang%2C+R&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Wang%2C+R&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Wang%2C+R&amp;start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">&hellip;</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14053">arXiv:2411.14053</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.14053">pdf</a>, <a href="https://arxiv.org/format/2411.14053">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Stereo Anything: Unifying Stereo Matching with Large-Scale Mixed Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Guo%2C+X">Xianda Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+C">Chenming Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Youmin Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Nie%2C+D">Dujun Nie</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+R">Ruilin Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+W">Wenzhao Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Poggi%2C+M">Matteo Poggi</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+L">Long Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14053v1-abstract-short" style="display: inline;"> Stereo matching has been a pivotal component in 3D vision, aiming to find corresponding points between pairs of stereo images to recover depth information. In this work, we introduce StereoAnything, a highly practical solution for robust stereo matching. Rather than focusing on a specialized model, our goal is to develop a versatile foundational model capable of handling stereo images across diver&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14053v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14053v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14053v1-abstract-full" style="display: none;"> Stereo matching has been a pivotal component in 3D vision, aiming to find corresponding points between pairs of stereo images to recover depth information. In this work, we introduce StereoAnything, a highly practical solution for robust stereo matching. Rather than focusing on a specialized model, our goal is to develop a versatile foundational model capable of handling stereo images across diverse environments. To this end, we scale up the dataset by collecting labeled stereo images and generating synthetic stereo pairs from unlabeled monocular images. To further enrich the model&#39;s ability to generalize across different conditions, we introduce a novel synthetic dataset that complements existing data by adding variability in baselines, camera angles, and scene types. We extensively evaluate the zero-shot capabilities of our model on five public datasets, showcasing its impressive ability to generalize to new, unseen data. Code will be available at \url{https://github.com/XiandaGuo/OpenStereo}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14053v1-abstract-full').style.display = 'none'; document.getElementById('2411.14053v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Code will be available at \url{https://github.com/XiandaGuo/OpenStereo}</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13054">arXiv:2411.13054</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.13054">pdf</a>, <a href="https://arxiv.org/format/2411.13054">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> </div> </div> <p class="title is-5 mathjax"> Generalized Ping-Pong: Off-Chip Memory Bandwidth Centric Pipelining Strategy for Processing-In-Memory Accelerators </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+R">Ruibao Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+B">Bonan Yan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13054v1-abstract-short" style="display: inline;"> Processing-in-memory (PIM) is a promising choice for accelerating deep neural networks (DNNs) featuring high efficiency and low power. However, the rapid upscaling of neural network model sizes poses a crucial challenge for the limited on-chip PIM capacity. When the PIM presumption of &#34;pre-loading DNN weights/parameters only once before repetitive computing&#34; is no longer practical, concurrent writ&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13054v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13054v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13054v1-abstract-full" style="display: none;"> Processing-in-memory (PIM) is a promising choice for accelerating deep neural networks (DNNs) featuring high efficiency and low power. However, the rapid upscaling of neural network model sizes poses a crucial challenge for the limited on-chip PIM capacity. When the PIM presumption of &#34;pre-loading DNN weights/parameters only once before repetitive computing&#34; is no longer practical, concurrent writing and computing techniques become necessary for PIM. Conventional methods of naive ping-pong or in~situ concurrent write/compute scheduling for PIM cause low utilization of off-chip memory bandwidth, subsequently offsetting the efficiency gain brought by PIM technology. To address this challenge, we propose an off-chip memory bandwidth centric pipelining strategy, named &#34;generalized ping-pong&#34;, to maximize the utilization and performance of PIM accelerators toward large DNN models. The core idea of the proposed generalized ping-pong strategy is to evenly distribute the active time and fully utilize the off-chip memory bandwidth. Based on a programmable and scalable SRAM PIM architecture, we quantitatively analyze and compare the generalized ping-pong with the conventional scheduling strategies of naive ping-pong and in-situ write/compute for PIM. Experiments show that the generalized ping-pong strategy achieves acceleration of over 1.67 times when fully utilizing the off-chip memory bandwidth. When further limiting the off-chip memory bandwidth ranging in 8~256 bytes per clock cycle, the proposed generalized ping-pong strategy accelerates 1.22~7.71 times versus naive ping-pong. The developed PIM accelerator design with the generalized ping-poing strategy is open-sourced at https://github.com/rw999creator/gpp-pim. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13054v1-abstract-full').style.display = 'none'; document.getElementById('2411.13054v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12690">arXiv:2411.12690</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.12690">pdf</a>, <a href="https://arxiv.org/format/2411.12690">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computational Engineering, Finance, and Science">cs.CE</span> </div> </div> <p class="title is-5 mathjax"> MORE-Stress: Model Order Reduction based Efficient Numerical Algorithm for Thermal Stress Simulation of TSV Arrays in 2.5D/3D IC </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+T">Tianxiang Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Q">Qipan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+Y">Yibo Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+R">Runsheng Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+R">Ru Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12690v1-abstract-short" style="display: inline;"> Thermomechanical stress induced by through-silicon vias (TSVs) plays an important role in the performance and reliability analysis of 2.5D/3D ICs. While the finite element method (FEM) adopted by commercial software can provide accurate simulation results, it is very time- and memory-consuming for large-scale analysis. Over the past decade, the linear superposition method has been utilized to perf&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12690v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12690v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12690v1-abstract-full" style="display: none;"> Thermomechanical stress induced by through-silicon vias (TSVs) plays an important role in the performance and reliability analysis of 2.5D/3D ICs. While the finite element method (FEM) adopted by commercial software can provide accurate simulation results, it is very time- and memory-consuming for large-scale analysis. Over the past decade, the linear superposition method has been utilized to perform fast thermal stress estimations of TSV arrays, but it suffers from a lack of accuracy. In this paper, we propose MORE-Stress, a novel strict numerical algorithm for efficient thermal stress simulation of TSV arrays based on model order reduction. Extensive experimental results demonstrate that our algorithm can realize a 153-504 times reduction in computational time and a 39-115 times reduction in memory usage compared with the commercial software ANSYS, with negligible errors less than 1%. Our algorithm is as efficient as the linear superposition method, with an order of magnitude smaller errors and fast convergence. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12690v1-abstract-full').style.display = 'none'; document.getElementById('2411.12690v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">2025 Design, Automation &amp; Test in Europe Conference &amp; Exhibition (DATE)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12197">arXiv:2411.12197</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.12197">pdf</a>, <a href="https://arxiv.org/format/2411.12197">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1007/978-981-97-8508-7_12">10.1007/978-981-97-8508-7_12 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> MTFusion: Reconstructing Any 3D Object from Single Image Using Multi-word Textual Inversion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yu Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+R">Ruowei Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jiaqi Li</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+Z">Zixiang Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+Q">Qijun Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12197v1-abstract-short" style="display: inline;"> Reconstructing 3D models from single-view images is a long-standing problem in computer vision. The latest advances for single-image 3D reconstruction extract a textual description from the input image and further utilize it to synthesize 3D models. However, existing methods focus on capturing a single key attribute of the image (e.g., object type, artistic style) and fail to consider the multi-pe&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12197v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12197v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12197v1-abstract-full" style="display: none;"> Reconstructing 3D models from single-view images is a long-standing problem in computer vision. The latest advances for single-image 3D reconstruction extract a textual description from the input image and further utilize it to synthesize 3D models. However, existing methods focus on capturing a single key attribute of the image (e.g., object type, artistic style) and fail to consider the multi-perspective information required for accurate 3D reconstruction, such as object shape and material properties. Besides, the reliance on Neural Radiance Fields hinders their ability to reconstruct intricate surfaces and texture details. In this work, we propose MTFusion, which leverages both image data and textual descriptions for high-fidelity 3D reconstruction. Our approach consists of two stages. First, we adopt a novel multi-word textual inversion technique to extract a detailed text description capturing the image&#39;s characteristics. Then, we use this description and the image to generate a 3D model with FlexiCubes. Additionally, MTFusion enhances FlexiCubes by employing a special decoder network for Signed Distance Functions, leading to faster training and finer surface representation. Extensive evaluations demonstrate that our MTFusion surpasses existing image-to-3D methods on a wide range of synthetic and real-world images. Furthermore, the ablation study proves the effectiveness of our network designs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12197v1-abstract-full').style.display = 'none'; document.getElementById('2411.12197v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">PRCV 2024</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Pattern Recognition and Computer Vision (2025), Springer Nature Singapore, pages 166-180, ISBN 978-981-97-8508-7 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11839">arXiv:2411.11839</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.11839">pdf</a>, <a href="https://arxiv.org/format/2411.11839">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> RoboGSim: A Real2Sim2Real Robotic Gaussian Splatting Simulator </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xinhai Li</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jialin Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Ziheng Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+R">Rui Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Jia%2C+F">Fan Jia</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+T">Tiancai Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Fan%2C+H">Haoqiang Fan</a>, <a href="/search/cs?searchtype=author&amp;query=Tseng%2C+K">Kuo-Kun Tseng</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+R">Ruiping Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11839v1-abstract-short" style="display: inline;"> Efficient acquisition of real-world embodied data has been increasingly critical. However, large-scale demonstrations captured by remote operation tend to take extremely high costs and fail to scale up the data size in an efficient manner. Sampling the episodes under a simulated environment is a promising way for large-scale collection while existing simulators fail to high-fidelity modeling on te&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11839v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11839v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11839v1-abstract-full" style="display: none;"> Efficient acquisition of real-world embodied data has been increasingly critical. However, large-scale demonstrations captured by remote operation tend to take extremely high costs and fail to scale up the data size in an efficient manner. Sampling the episodes under a simulated environment is a promising way for large-scale collection while existing simulators fail to high-fidelity modeling on texture and physics. To address these limitations, we introduce the RoboGSim, a real2sim2real robotic simulator, powered by 3D Gaussian Splatting and the physics engine. RoboGSim mainly includes four parts: Gaussian Reconstructor, Digital Twins Builder, Scene Composer, and Interactive Engine. It can synthesize the simulated data with novel views, objects, trajectories, and scenes. RoboGSim also provides an online, reproducible, and safe evaluation for different manipulation policies. The real2sim and sim2real transfer experiments show a high consistency in the texture and physics. Moreover, the effectiveness of synthetic data is validated under the real-world manipulated tasks. We hope RoboGSim serves as a closed-loop simulator for fair comparison on policy learning. More information can be found on our project page https://robogsim.github.io/ . <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11839v1-abstract-full').style.display = 'none'; document.getElementById('2411.11839v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11668">arXiv:2411.11668</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.11668">pdf</a>, <a href="https://arxiv.org/format/2411.11668">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Quantitative Methods">q-bio.QM</span> </div> </div> <p class="title is-5 mathjax"> Efficient and Robust Continual Graph Learning for Graph Classification in Biology </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+D">Ding Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Downer%2C+J">Jane Downer</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+C">Can Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+R">Ren Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11668v1-abstract-short" style="display: inline;"> Graph classification is essential for understanding complex biological systems, where molecular structures and interactions are naturally represented as graphs. Traditional graph neural networks (GNNs) perform well on static tasks but struggle in dynamic settings due to catastrophic forgetting. We present Perturbed and Sparsified Continual Graph Learning (PSCGL), a robust and efficient continual g&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11668v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11668v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11668v1-abstract-full" style="display: none;"> Graph classification is essential for understanding complex biological systems, where molecular structures and interactions are naturally represented as graphs. Traditional graph neural networks (GNNs) perform well on static tasks but struggle in dynamic settings due to catastrophic forgetting. We present Perturbed and Sparsified Continual Graph Learning (PSCGL), a robust and efficient continual graph learning framework for graph data classification, specifically targeting biological datasets. We introduce a perturbed sampling strategy to identify critical data points that contribute to model learning and a motif-based graph sparsification technique to reduce storage needs while maintaining performance. Additionally, our PSCGL framework inherently defends against graph backdoor attacks, which is crucial for applications in sensitive biological contexts. Extensive experiments on biological datasets demonstrate that PSCGL not only retains knowledge across tasks but also enhances the efficiency and robustness of graph classification models in biology. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11668v1-abstract-full').style.display = 'none'; document.getElementById('2411.11668v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11406">arXiv:2411.11406</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.11406">pdf</a>, <a href="https://arxiv.org/format/2411.11406">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Bridging the Resource Gap: Deploying Advanced Imitation Learning Models onto Affordable Embedded Platforms </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ge%2C+H">Haizhou Ge</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+R">Ruixiang Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+Z">Zhu-ang Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+H">Hongrui Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Deng%2C+R">Ruichen Deng</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+Y">Yuhang Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Pang%2C+Z">Zeyu Pang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+G">Guyue Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+J">Junyu Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Shi%2C+L">Lu Shi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11406v1-abstract-short" style="display: inline;"> Advanced imitation learning with structures like the transformer is increasingly demonstrating its advantages in robotics. However, deploying these large-scale models on embedded platforms remains a major challenge. In this paper, we propose a pipeline that facilitates the migration of advanced imitation learning algorithms to edge devices. The process is achieved via an efficient model compressio&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11406v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11406v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11406v1-abstract-full" style="display: none;"> Advanced imitation learning with structures like the transformer is increasingly demonstrating its advantages in robotics. However, deploying these large-scale models on embedded platforms remains a major challenge. In this paper, we propose a pipeline that facilitates the migration of advanced imitation learning algorithms to edge devices. The process is achieved via an efficient model compression method and a practical asynchronous parallel method Temporal Ensemble with Dropped Actions (TEDA) that enhances the smoothness of operations. To show the efficiency of the proposed pipeline, large-scale imitation learning models are trained on a server and deployed on an edge device to complete various manipulation tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11406v1-abstract-full').style.display = 'none'; document.getElementById('2411.11406v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by the 2024 IEEE International Conference on Robotics and Biomimetics (IEEE ROBIO 2024)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11305">arXiv:2411.11305</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.11305">pdf</a>, <a href="https://arxiv.org/ps/2411.11305">ps</a>, <a href="https://arxiv.org/format/2411.11305">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> TP-UNet: Temporal Prompt Guided UNet for Medical Image Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+R">Ranmin Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhuang%2C+L">Limin Zhuang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+H">Hongkun Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+B">Boyan Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Cai%2C+R">Ruichu Cai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11305v2-abstract-short" style="display: inline;"> The advancement of medical image segmentation techniques has been propelled by the adoption of deep learning techniques, particularly UNet-based approaches, which exploit semantic information to improve the accuracy of segmentations. However, the order of organs in scanned images has been disregarded by current medical image segmentation approaches based on UNet. Furthermore, the inherent network&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11305v2-abstract-full').style.display = 'inline'; document.getElementById('2411.11305v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11305v2-abstract-full" style="display: none;"> The advancement of medical image segmentation techniques has been propelled by the adoption of deep learning techniques, particularly UNet-based approaches, which exploit semantic information to improve the accuracy of segmentations. However, the order of organs in scanned images has been disregarded by current medical image segmentation approaches based on UNet. Furthermore, the inherent network structure of UNet does not provide direct capabilities for integrating temporal information. To efficiently integrate temporal information, we propose TP-UNet that utilizes temporal prompts, encompassing organ-construction relationships, to guide the segmentation UNet model. Specifically, our framework is featured with cross-attention and semantic alignment based on unsupervised contrastive learning to combine temporal prompts and image features effectively. Extensive evaluations on two medical image segmentation datasets demonstrate the state-of-the-art performance of TP-UNet. Our implementation will be open-sourced after acceptance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11305v2-abstract-full').style.display = 'none'; document.getElementById('2411.11305v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10683">arXiv:2411.10683</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.10683">pdf</a>, <a href="https://arxiv.org/format/2411.10683">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> I&#39;m Spartacus, No, I&#39;m Spartacus: Measuring and Understanding LLM Identity Confusion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+K">Kun Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhuang%2C+S">Shichao Zhuang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yue Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+M">Minghui Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+R">Ruoxi Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+K">Kaidi Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Fu%2C+X">Xinwen Fu</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+X">Xiuzhen Cheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10683v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) excel in diverse tasks such as text generation, data analysis, and software development, making them indispensable across domains like education, business, and creative industries. However, the rapid proliferation of LLMs (with over 560 companies developing or deploying them as of 2024) has raised concerns about their originality and trustworthiness. A notable issue, t&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10683v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10683v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10683v1-abstract-full" style="display: none;"> Large Language Models (LLMs) excel in diverse tasks such as text generation, data analysis, and software development, making them indispensable across domains like education, business, and creative industries. However, the rapid proliferation of LLMs (with over 560 companies developing or deploying them as of 2024) has raised concerns about their originality and trustworthiness. A notable issue, termed identity confusion, has emerged, where LLMs misrepresent their origins or identities. This study systematically examines identity confusion through three research questions: (1) How prevalent is identity confusion among LLMs? (2) Does it arise from model reuse, plagiarism, or hallucination? (3) What are the security and trust-related impacts of identity confusion? To address these, we developed an automated tool combining documentation analysis, self-identity recognition testing, and output similarity comparisons--established methods for LLM fingerprinting--and conducted a structured survey via Credamo to assess its impact on user trust. Our analysis of 27 LLMs revealed that 25.93% exhibit identity confusion. Output similarity analysis confirmed that these issues stem from hallucinations rather than replication or reuse. Survey results further highlighted that identity confusion significantly erodes trust, particularly in critical tasks like education and professional use, with declines exceeding those caused by logical errors or inconsistencies. Users attributed these failures to design flaws, incorrect training data, and perceived plagiarism, underscoring the systemic risks posed by identity confusion to LLM reliability and trustworthiness. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10683v1-abstract-full').style.display = 'none'; document.getElementById('2411.10683v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">16 pages, 8 figure, 6 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10679">arXiv:2411.10679</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.10679">pdf</a>, <a href="https://arxiv.org/format/2411.10679">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> SPDFusion: An Infrared and Visible Image Fusion Network Based on a Non-Euclidean Representation of Riemannian Manifolds </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Kang%2C+H">Huan Kang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+H">Hui Li</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+T">Tianyang Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+R">Rui Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+X">Xiao-Jun Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Kittler%2C+J">Josef Kittler</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10679v1-abstract-short" style="display: inline;"> Euclidean representation learning methods have achieved commendable results in image fusion tasks, which can be attributed to their clear advantages in handling with linear space. However, data collected from a realistic scene usually have a non-Euclidean structure, where Euclidean metric might be limited in representing the true data relationships, degrading fusion performance. To address this is&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10679v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10679v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10679v1-abstract-full" style="display: none;"> Euclidean representation learning methods have achieved commendable results in image fusion tasks, which can be attributed to their clear advantages in handling with linear space. However, data collected from a realistic scene usually have a non-Euclidean structure, where Euclidean metric might be limited in representing the true data relationships, degrading fusion performance. To address this issue, a novel SPD (symmetric positive definite) manifold learning framework is proposed for multi-modal image fusion, named SPDFusion, which extends the image fusion approach from the Euclidean space to the SPD manifolds. Specifically, we encode images according to the Riemannian geometry to exploit their intrinsic statistical correlations, thereby aligning with human visual perception. Actually, the SPD matrix underpins our network learning, with a cross-modal fusion strategy employed to harness modality-specific dependencies and augment complementary information. Subsequently, an attention module is designed to process the learned weight matrix, facilitating the weighting of spatial global correlation semantics via SPD matrix multiplication. Based on this, we design an end-to-end fusion network based on cross-modal manifold learning. Extensive experiments on public datasets demonstrate that our framework exhibits superior performance compared to the current state-of-the-art methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10679v1-abstract-full').style.display = 'none'; document.getElementById('2411.10679v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">14 pages, 12 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.4 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10673">arXiv:2411.10673</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.10673">pdf</a>, <a href="https://arxiv.org/format/2411.10673">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> How to Defend Against Large-scale Model Poisoning Attacks in Federated Learning: A Vertical Solution </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Jinbo Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+R">Ruijin Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+F">Fengli Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10673v1-abstract-short" style="display: inline;"> Federated learning (FL) is vulnerable to model poisoning attacks due to its distributed nature. The current defenses start from all user gradients (model updates) in each communication round and solve for the optimal aggregation gradients (horizontal solution). This horizontal solution will completely fail when facing large-scale (&gt;50%) model poisoning attacks. In this work, based on the key insig&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10673v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10673v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10673v1-abstract-full" style="display: none;"> Federated learning (FL) is vulnerable to model poisoning attacks due to its distributed nature. The current defenses start from all user gradients (model updates) in each communication round and solve for the optimal aggregation gradients (horizontal solution). This horizontal solution will completely fail when facing large-scale (&gt;50%) model poisoning attacks. In this work, based on the key insight that the convergence process of the model is a highly predictable process, we break away from the traditional horizontal solution of defense and innovatively transform the problem of solving the optimal aggregation gradients into a vertical solution problem. We propose VERT, which uses global communication rounds as the vertical axis, trains a predictor using historical gradients information to predict user gradients, and compares the similarity with actual user gradients to precisely and efficiently select the optimal aggregation gradients. In order to reduce the computational complexity of VERT, we design a low dimensional vector projector to project the user gradients to a computationally acceptable length, and then perform subsequent predictor training and prediction tasks. Exhaustive experiments show that VERT is efficient and scalable, exhibiting excellent large-scale (&gt;=80%) model poisoning defense effects under different FL scenarios. In addition, we can design projector with different structures for different model structures to adapt to aggregation servers with different computing power. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10673v1-abstract-full').style.display = 'none'; document.getElementById('2411.10673v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10665">arXiv:2411.10665</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.10665">pdf</a>, <a href="https://arxiv.org/format/2411.10665">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> AutoIoT: Automated IoT Platform Using Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+Y">Ye Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+M">Minghui Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yue Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+K">Kun Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+R">Ruoxi Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+L">Lian Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10665v1-abstract-short" style="display: inline;"> IoT platforms, particularly smart home platforms providing significant convenience to people&#39;s lives such as Apple HomeKit and Samsung SmartThings, allow users to create automation rules through trigger-action programming. However, some users may lack the necessary knowledge to formulate automation rules, thus preventing them from fully benefiting from the conveniences offered by smart home techno&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10665v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10665v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10665v1-abstract-full" style="display: none;"> IoT platforms, particularly smart home platforms providing significant convenience to people&#39;s lives such as Apple HomeKit and Samsung SmartThings, allow users to create automation rules through trigger-action programming. However, some users may lack the necessary knowledge to formulate automation rules, thus preventing them from fully benefiting from the conveniences offered by smart home technology. To address this, smart home platforms provide pre-defined automation policies based on the smart home devices registered by the user. Nevertheless, these policies, being pre-generated and relatively simple, fail to adequately cover the diverse needs of users. Furthermore, conflicts may arise between automation rules, and integrating conflict detection into the IoT platform increases the burden on developers. In this paper, we propose AutoIoT, an automated IoT platform based on Large Language Models (LLMs) and formal verification techniques, designed to achieve end-to-end automation through device information extraction, LLM-based rule generation, conflict detection, and avoidance. AutoIoT can help users generate conflict-free automation rules and assist developers in generating codes for conflict detection, thereby enhancing their experience. A code adapter has been designed to separate logical reasoning from the syntactic details of code generation, enabling LLMs to generate code for programming languages beyond their training data. Finally, we evaluated the performance of AutoIoT and presented a case study demonstrating how AutoIoT can integrate with existing IoT platforms. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10665v1-abstract-full').style.display = 'none'; document.getElementById('2411.10665v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 10 figures, 6 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09492">arXiv:2411.09492</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.09492">pdf</a>, <a href="https://arxiv.org/format/2411.09492">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> MM-Eval: A Hierarchical Benchmark for Modern Mongolian Evaluation in LLMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+M">Mengyuan Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+R">Ruihui Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Xia%2C+B">Bo Xia</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+Y">Yuan Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+X">Xiaobing Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09492v1-abstract-short" style="display: inline;"> Large language models (LLMs) excel in high-resource languages but face notable challenges in low-resource languages like Mongolian. This paper addresses these challenges by categorizing capabilities into language abilities (syntax and semantics) and cognitive abilities (knowledge and reasoning). To systematically evaluate these areas, we developed MM-Eval, a specialized dataset based on Modern Mon&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09492v1-abstract-full').style.display = 'inline'; document.getElementById('2411.09492v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09492v1-abstract-full" style="display: none;"> Large language models (LLMs) excel in high-resource languages but face notable challenges in low-resource languages like Mongolian. This paper addresses these challenges by categorizing capabilities into language abilities (syntax and semantics) and cognitive abilities (knowledge and reasoning). To systematically evaluate these areas, we developed MM-Eval, a specialized dataset based on Modern Mongolian Language Textbook I and enriched with WebQSP and MGSM datasets. Preliminary experiments on models including Qwen2-7B-Instruct, GLM4-9b-chat, Llama3.1-8B-Instruct, GPT-4, and DeepseekV2.5 revealed that: 1) all models performed better on syntactic tasks than semantic tasks, highlighting a gap in deeper language understanding; and 2) knowledge tasks showed a moderate decline, suggesting that models can transfer general knowledge from high-resource to low-resource contexts. The release of MM-Eval, comprising 569 syntax, 677 semantics, 344 knowledge, and 250 reasoning tasks, offers valuable insights for advancing NLP and LLMs in low-resource languages like Mongolian. The dataset is available at https://github.com/joenahm/MM-Eval. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09492v1-abstract-full').style.display = 'none'; document.getElementById('2411.09492v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09422">arXiv:2411.09422</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.09422">pdf</a>, <a href="https://arxiv.org/format/2411.09422">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> OpenLS-DGF: An Adaptive Open-Source Dataset Generation Framework for Machine Learning Tasks in Logic Synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ni%2C+L">Liwei Ni</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+R">Rui Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+M">Miao Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Meng%2C+X">Xingyu Meng</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+X">Xiaoze Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+J">Junfeng Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+G">Guojie Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Chu%2C+Z">Zhufei Chu</a>, <a href="/search/cs?searchtype=author&amp;query=Qian%2C+W">Weikang Qian</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+X">Xiaoyan Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Xie%2C+B">Biwei Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xingquan Li</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+H">Huawei Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09422v2-abstract-short" style="display: inline;"> This paper introduces OpenLS-DGF, an adaptive logic synthesis dataset generation framework, to enhance machine learning~(ML) applications within the logic synthesis process. Previous dataset generation flows were tailored for specific tasks or lacked integrated machine learning capabilities. While OpenLS-DGF supports various machine learning tasks by encapsulating the three fundamental steps of lo&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09422v2-abstract-full').style.display = 'inline'; document.getElementById('2411.09422v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09422v2-abstract-full" style="display: none;"> This paper introduces OpenLS-DGF, an adaptive logic synthesis dataset generation framework, to enhance machine learning~(ML) applications within the logic synthesis process. Previous dataset generation flows were tailored for specific tasks or lacked integrated machine learning capabilities. While OpenLS-DGF supports various machine learning tasks by encapsulating the three fundamental steps of logic synthesis: Boolean representation, logic optimization, and technology mapping. It preserves the original information in both Verilog and machine-learning-friendly GraphML formats. The verilog files offer semi-customizable capabilities, enabling researchers to insert additional steps and incrementally refine the generated dataset. Furthermore, OpenLS-DGF includes an adaptive circuit engine that facilitates the final dataset management and downstream tasks. The generated OpenLS-D-v1 dataset comprises 46 combinational designs from established benchmarks, totaling over 966,000 Boolean circuits. OpenLS-D-v1 supports integrating new data features, making it more versatile for new challenges. This paper demonstrates the versatility of OpenLS-D-v1 through four distinct downstream tasks: circuit classification, circuit ranking, quality of results (QoR) prediction, and probability prediction. Each task is chosen to represent essential steps of logic synthesis, and the experimental results show the generated dataset from OpenLS-DGF achieves prominent diversity and applicability. The source code and datasets are available at https://github.com/Logic-Factory/ACE/blob/master/OpenLS-DGF/readme.md. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09422v2-abstract-full').style.display = 'none'; document.getElementById('2411.09422v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">14 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09349">arXiv:2411.09349</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.09349">pdf</a>, <a href="https://arxiv.org/format/2411.09349">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> ParaLBench: A Large-Scale Benchmark for Computational Paralinguistics over Acoustic Foundation Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zixing Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+W">Weixiang Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+Z">Zhongren Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+K">Kanglin Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Y">Yimeng Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+J">Jing Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+R">Runming Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+D">Dong-Yan Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09349v1-abstract-short" style="display: inline;"> Computational paralinguistics (ComParal) aims to develop algorithms and models to automatically detect, analyze, and interpret non-verbal information from speech communication, e. g., emotion, health state, age, and gender. Despite its rapid progress, it heavily depends on sophisticatedly designed models given specific paralinguistic tasks. Thus, the heterogeneity and diversity of ComParal models&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09349v1-abstract-full').style.display = 'inline'; document.getElementById('2411.09349v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09349v1-abstract-full" style="display: none;"> Computational paralinguistics (ComParal) aims to develop algorithms and models to automatically detect, analyze, and interpret non-verbal information from speech communication, e. g., emotion, health state, age, and gender. Despite its rapid progress, it heavily depends on sophisticatedly designed models given specific paralinguistic tasks. Thus, the heterogeneity and diversity of ComParal models largely prevent the realistic implementation of ComParal models. Recently, with the advent of acoustic foundation models because of self-supervised learning, developing more generic models that can efficiently perceive a plethora of paralinguistic information has become an active topic in speech processing. However, it lacks a unified evaluation framework for a fair and consistent performance comparison. To bridge this gap, we conduct a large-scale benchmark, namely ParaLBench, which concentrates on standardizing the evaluation process of diverse paralinguistic tasks, including critical aspects of affective computing such as emotion recognition and emotion dimensions prediction, over different acoustic foundation models. This benchmark contains ten datasets with thirteen distinct paralinguistic tasks, covering short-, medium- and long-term characteristics. Each task is carried out on 14 acoustic foundation models under a unified evaluation framework, which allows for an unbiased methodological comparison and offers a grounded reference for the ComParal community. Based on the insights gained from ParaLBench, we also point out potential research directions, i.e., the cross-corpus generalizability, to propel ComParal research in the future. The code associated with this study will be available to foster the transparency and replicability of this work for succeeding researchers. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09349v1-abstract-full').style.display = 'none'; document.getElementById('2411.09349v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.08840">arXiv:2411.08840</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.08840">pdf</a>, <a href="https://arxiv.org/format/2411.08840">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Multimodal Instruction Tuning with Hybrid State Space Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+J">Jianing Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+H">Han Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+S">Shuai Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Xie%2C+N">Ning Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+R">Ruijie Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Nie%2C+X">Xiaohan Nie</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+S">Sheng Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+L">Lingyun Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.08840v1-abstract-short" style="display: inline;"> Handling lengthy context is crucial for enhancing the recognition and understanding capabilities of multimodal large language models (MLLMs) in applications such as processing high-resolution images or high frame rate videos. The rise in image resolution and frame rate substantially increases computational demands due to the increased number of input tokens. This challenge is further exacerbated b&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08840v1-abstract-full').style.display = 'inline'; document.getElementById('2411.08840v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.08840v1-abstract-full" style="display: none;"> Handling lengthy context is crucial for enhancing the recognition and understanding capabilities of multimodal large language models (MLLMs) in applications such as processing high-resolution images or high frame rate videos. The rise in image resolution and frame rate substantially increases computational demands due to the increased number of input tokens. This challenge is further exacerbated by the quadratic complexity with respect to sequence length of the self-attention mechanism. Most prior works either pre-train models with long contexts, overlooking the efficiency problem, or attempt to reduce the context length via downsampling (e.g., identify the key image patches or frames) to decrease the context length, which may result in information loss. To circumvent this issue while keeping the remarkable effectiveness of MLLMs, we propose a novel approach using a hybrid transformer-MAMBA model to efficiently handle long contexts in multimodal applications. Our multimodal model can effectively process long context input exceeding 100k tokens, outperforming existing models across various benchmarks. Remarkably, our model enhances inference efficiency for high-resolution images and high-frame-rate videos by about 4 times compared to current models, with efficiency gains increasing as image resolution or video frames rise. Furthermore, our model is the first to be trained on low-resolution images or low-frame-rate videos while being capable of inference on high-resolution images and high-frame-rate videos, offering flexibility for inference in diverse scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08840v1-abstract-full').style.display = 'none'; document.getElementById('2411.08840v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.08437">arXiv:2411.08437</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.08437">pdf</a>, <a href="https://arxiv.org/format/2411.08437">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> </div> </div> <p class="title is-5 mathjax"> Evolutionary Algorithm with Detection Region Method for Constrained Multi-Objective Problems with Binary Constraints </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Huang%2C+W">Weixiong Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+R">Rui Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+T">Tao Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Qi%2C+S">Sheng Qi</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+L">Ling Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.08437v1-abstract-short" style="display: inline;"> Solving constrained multi-objective optimization problems (CMOPs) is a challenging task. While many practical algorithms have been developed to tackle CMOPs, real-world scenarios often present cases where the constraint functions are unknown or unquantifiable, resulting in only binary outcomes (feasible or infeasible). This limitation reduces the effectiveness of constraint violation guidance, whi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08437v1-abstract-full').style.display = 'inline'; document.getElementById('2411.08437v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.08437v1-abstract-full" style="display: none;"> Solving constrained multi-objective optimization problems (CMOPs) is a challenging task. While many practical algorithms have been developed to tackle CMOPs, real-world scenarios often present cases where the constraint functions are unknown or unquantifiable, resulting in only binary outcomes (feasible or infeasible). This limitation reduces the effectiveness of constraint violation guidance, which can negatively impact the performance of existing algorithms that rely on this approach. Such challenges are particularly detrimental for algorithms employing the epsilon-based method, as they hinder effective relaxation of the feasible region. To address these challenges, this paper proposes a novel algorithm called DRMCMO based on the detection region method. In DRMCMO, detection regions dynamic monitor feasible solutions to enhance convergence, helping the population escape local optima. Additionally, these regions collaborate with the neighbor pairing strategy to improve population diversity within narrow feasible areas. We have modified three existing test suites to serve as benchmark test problems for CMOPs with binary constraints(CMOP/BC) and conducted comprehensive comparative experiments with state-of-the-art algorithms on these test suites and real-world problems. The results demonstrate the strong competitiveness of DRMCMO against state-of-the-art algorithms. Given the limited research on CMOP/BC, our study offers a new perspective for advancing this field. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08437v1-abstract-full').style.display = 'none'; document.getElementById('2411.08437v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.08232">arXiv:2411.08232</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.08232">pdf</a>, <a href="https://arxiv.org/format/2411.08232">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Optimization and Control">math.OC</span> </div> </div> <p class="title is-5 mathjax"> Imitation Learning from Observations: An Autoregressive Mixture of Experts Approach </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+R">Renzi Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Acerbo%2C+F+S">Flavia Sofia Acerbo</a>, <a href="/search/cs?searchtype=author&amp;query=Son%2C+T+D">Tong Duy Son</a>, <a href="/search/cs?searchtype=author&amp;query=Patrinos%2C+P">Panagiotis Patrinos</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.08232v1-abstract-short" style="display: inline;"> This paper presents a novel approach to imitation learning from observations, where an autoregressive mixture of experts model is deployed to fit the underlying policy. The parameters of the model are learned via a two-stage framework. By leveraging the existing dynamics knowledge, the first stage of the framework estimates the control input sequences and hence reduces the problem complexity. At t&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08232v1-abstract-full').style.display = 'inline'; document.getElementById('2411.08232v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.08232v1-abstract-full" style="display: none;"> This paper presents a novel approach to imitation learning from observations, where an autoregressive mixture of experts model is deployed to fit the underlying policy. The parameters of the model are learned via a two-stage framework. By leveraging the existing dynamics knowledge, the first stage of the framework estimates the control input sequences and hence reduces the problem complexity. At the second stage, the policy is learned by solving a regularized maximum-likelihood estimation problem using the estimated control input sequences. We further extend the learning procedure by incorporating a Lyapunov stability constraint to ensure asymptotic stability of the identified model, for accurate multi-step predictions. The effectiveness of the proposed framework is validated using two autonomous driving datasets collected from human demonstrations, demonstrating its practical applicability in modelling complex nonlinear dynamics. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08232v1-abstract-full').style.display = 'none'; document.getElementById('2411.08232v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07598">arXiv:2411.07598</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.07598">pdf</a>, <a href="https://arxiv.org/format/2411.07598">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Problem-Oriented Segmentation and Retrieval: Case Study on Tutoring Conversations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+R+E">Rose E. Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wirawarn%2C+P">Pawan Wirawarn</a>, <a href="/search/cs?searchtype=author&amp;query=Lam%2C+K">Kenny Lam</a>, <a href="/search/cs?searchtype=author&amp;query=Khattab%2C+O">Omar Khattab</a>, <a href="/search/cs?searchtype=author&amp;query=Demszky%2C+D">Dorottya Demszky</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.07598v1-abstract-short" style="display: inline;"> Many open-ended conversations (e.g., tutoring lessons or business meetings) revolve around pre-defined reference materials, like worksheets or meeting bullets. To provide a framework for studying such conversation structure, we introduce Problem-Oriented Segmentation &amp; Retrieval (POSR), the task of jointly breaking down conversations into segments and linking each segment to the relevant reference&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07598v1-abstract-full').style.display = 'inline'; document.getElementById('2411.07598v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.07598v1-abstract-full" style="display: none;"> Many open-ended conversations (e.g., tutoring lessons or business meetings) revolve around pre-defined reference materials, like worksheets or meeting bullets. To provide a framework for studying such conversation structure, we introduce Problem-Oriented Segmentation &amp; Retrieval (POSR), the task of jointly breaking down conversations into segments and linking each segment to the relevant reference item. As a case study, we apply POSR to education where effectively structuring lessons around problems is critical yet difficult. We present LessonLink, the first dataset of real-world tutoring lessons, featuring 3,500 segments, spanning 24,300 minutes of instruction and linked to 116 SAT math problems. We define and evaluate several joint and independent approaches for POSR, including segmentation (e.g., TextTiling), retrieval (e.g., ColBERT), and large language models (LLMs) methods. Our results highlight that modeling POSR as one joint task is essential: POSR methods outperform independent segmentation and retrieval pipelines by up to +76% on joint metrics and surpass traditional segmentation methods by up to +78% on segmentation metrics. We demonstrate POSR&#39;s practical impact on downstream education applications, deriving new insights on the language and time use in real-world lesson structures. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07598v1-abstract-full').style.display = 'none'; document.getElementById('2411.07598v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">EMNLP 2024 Findings. Our code and dataset are open-sourced at https://github.com/rosewang2008/posr</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06826">arXiv:2411.06826</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.06826">pdf</a>, <a href="https://arxiv.org/format/2411.06826">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> Adaptive Conditional Expert Selection Network for Multi-domain Recommendation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Dong%2C+K">Kuiyao Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Lou%2C+X">Xingyu Lou</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+F">Feng Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+R">Ruian Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+W">Wenyi Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+P">Ping Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Jun Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06826v1-abstract-short" style="display: inline;"> Mixture-of-Experts (MOE) has recently become the de facto standard in Multi-domain recommendation (MDR) due to its powerful expressive ability. However, such MOE-based method typically employs all experts for each instance, leading to scalability issue and low-discriminability between domains and experts. Furthermore, the design of commonly used domain-specific networks exacerbates the scalability&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06826v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06826v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06826v1-abstract-full" style="display: none;"> Mixture-of-Experts (MOE) has recently become the de facto standard in Multi-domain recommendation (MDR) due to its powerful expressive ability. However, such MOE-based method typically employs all experts for each instance, leading to scalability issue and low-discriminability between domains and experts. Furthermore, the design of commonly used domain-specific networks exacerbates the scalability issues. To tackle the problems, We propose a novel method named CESAA consists of Conditional Expert Selection (CES) Module and Adaptive Expert Aggregation (AEA) Module to tackle these challenges. Specifically, CES first combines a sparse gating strategy with domain-shared experts. Then AEA utilizes mutual information loss to strengthen the correlations between experts and specific domains, and significantly improve the distinction between experts. As a result, only domain-shared experts and selected domain-specific experts are activated for each instance, striking a balance between computational efficiency and model performance. Experimental results on both public ranking and industrial retrieval datasets verify the effectiveness of our method in MDR tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06826v1-abstract-full').style.display = 'none'; document.getElementById('2411.06826v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06667">arXiv:2411.06667</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.06667">pdf</a>, <a href="https://arxiv.org/format/2411.06667">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> DCF-DS: Deep Cascade Fusion of Diarization and Separation for Speech Recognition under Realistic Single-Channel Conditions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Niu%2C+S">Shu-Tong Niu</a>, <a href="/search/cs?searchtype=author&amp;query=Du%2C+J">Jun Du</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+R">Ruo-Yu Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+G">Gao-Bin Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+T">Tian Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Pan%2C+J">Jia Pan</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+Y">Yu Hu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06667v1-abstract-short" style="display: inline;"> We propose a single-channel Deep Cascade Fusion of Diarization and Separation (DCF-DS) framework for back-end speech recognition, combining neural speaker diarization (NSD) and speech separation (SS). First, we sequentially integrate the NSD and SS modules within a joint training framework, enabling the separation module to leverage speaker time boundaries from the diarization module effectively.&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06667v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06667v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06667v1-abstract-full" style="display: none;"> We propose a single-channel Deep Cascade Fusion of Diarization and Separation (DCF-DS) framework for back-end speech recognition, combining neural speaker diarization (NSD) and speech separation (SS). First, we sequentially integrate the NSD and SS modules within a joint training framework, enabling the separation module to leverage speaker time boundaries from the diarization module effectively. Then, to complement DCF-DS training, we introduce a window-level decoding scheme that allows the DCF-DS framework to handle the sparse data convergence instability (SDCI) problem. We also explore using an NSD system trained on real datasets to provide more accurate speaker boundaries during decoding. Additionally, we incorporate an optional multi-input multi-output speech enhancement module (MIMO-SE) within the DCF-DS framework, which offers further performance gains. Finally, we enhance diarization results by re-clustering DCF-DS outputs, improving ASR accuracy. By incorporating the DCF-DS method, we achieved first place in the realistic single-channel track of the CHiME-8 NOTSOFAR-1 challenge. We also perform the evaluation on the open LibriCSS dataset, achieving a new state-of-the-art performance on single-channel speech recognition. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06667v1-abstract-full').style.display = 'none'; document.getElementById('2411.06667v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06655">arXiv:2411.06655</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.06655">pdf</a>, <a href="https://arxiv.org/format/2411.06655">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Explore the Reasoning Capability of LLMs in the Chess Testbed </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+S">Shu Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Ji%2C+L">Lei Ji</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+R">Renxi Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+W">Wenxiao Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+H">Haokun Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Hou%2C+Y">Yifan Hou</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Y+N">Ying Nian Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06655v1-abstract-short" style="display: inline;"> Reasoning is a central capability of human intelligence. In recent years, with the advent of large-scale datasets, pretrained large language models have emerged with new capabilities, including reasoning. However, these models still struggle with long-term, complex reasoning tasks, such as playing chess. Based on the observation that expert chess players employ a dual approach combining long-term&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06655v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06655v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06655v1-abstract-full" style="display: none;"> Reasoning is a central capability of human intelligence. In recent years, with the advent of large-scale datasets, pretrained large language models have emerged with new capabilities, including reasoning. However, these models still struggle with long-term, complex reasoning tasks, such as playing chess. Based on the observation that expert chess players employ a dual approach combining long-term strategic play with short-term tactical play along with language explanation, we propose improving the reasoning capability of large language models in chess by integrating annotated strategy and tactic. Specifically, we collect a dataset named MATE, which consists of 1 million chess positions with candidate moves annotated by chess experts for strategy and tactics. We finetune the LLaMA-3-8B model and compare it against state-of-the-art commercial language models in the task of selecting better chess moves. Our experiments show that our models perform better than GPT, Claude, and Gemini models. We find that language explanations can enhance the reasoning capability of large language models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06655v1-abstract-full').style.display = 'none'; document.getElementById('2411.06655v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">submitted to NAACL2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06374">arXiv:2411.06374</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.06374">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Metric Learning for Tag Recommendation: Tackling Data Sparsity and Cold Start Issues </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Luo%2C+Y">Yuanshuai Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+R">Rui Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Liang%2C+Y">Yaxin Liang</a>, <a href="/search/cs?searchtype=author&amp;query=Liang%2C+A">Ankai Liang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+W">Wenyi Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06374v1-abstract-short" style="display: inline;"> With the rapid growth of digital information, personalized recommendation systems have become an indispensable part of Internet services, especially in the fields of e-commerce, social media, and online entertainment. However, traditional collaborative filtering and content-based recommendation methods have limitations in dealing with data sparsity and cold start problems, especially in the face o&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06374v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06374v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06374v1-abstract-full" style="display: none;"> With the rapid growth of digital information, personalized recommendation systems have become an indispensable part of Internet services, especially in the fields of e-commerce, social media, and online entertainment. However, traditional collaborative filtering and content-based recommendation methods have limitations in dealing with data sparsity and cold start problems, especially in the face of largescale heterogeneous data, which makes it difficult to meet user expectations. This paper proposes a new label recommendation algorithm based on metric learning, which aims to overcome the challenges of traditional recommendation systems by learning effective distance or similarity metrics to capture the subtle differences between user preferences and item features. Experimental results show that the algorithm outperforms baseline methods including local response metric learning (LRML), collaborative metric learning (CML), and adaptive tensor factorization (ATF) based on adversarial learning on multiple evaluation metrics. In particular, it performs particularly well in the accuracy of the first few recommended items, while maintaining high robustness and maintaining high recommendation accuracy. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06374v1-abstract-full').style.display = 'none'; document.getElementById('2411.06374v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06096">arXiv:2411.06096</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.06096">pdf</a>, <a href="https://arxiv.org/format/2411.06096">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> ZhoBLiMP: a Systematic Assessment of Language Models with Linguistic Minimal Pairs in Chinese </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yikang Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+Y">Yeting Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+H">Hongao Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+L">Lilong Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Qian%2C+Z">Zhiheng Qian</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+S">Siyuan Song</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+K">Kejia Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+J">Jialong Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+P">Pei Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+B">Baosong Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+R">Rui Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+H">Hai Hu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06096v1-abstract-short" style="display: inline;"> Whether and how language models (LMs) acquire the syntax of natural languages has been widely evaluated under the minimal pair paradigm. However, a lack of wide-coverage benchmarks in languages other than English has constrained systematic investigations into the issue. Addressing it, we first introduce ZhoBLiMP, the most comprehensive benchmark of linguistic minimal pairs for Chinese to date, wit&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06096v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06096v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06096v1-abstract-full" style="display: none;"> Whether and how language models (LMs) acquire the syntax of natural languages has been widely evaluated under the minimal pair paradigm. However, a lack of wide-coverage benchmarks in languages other than English has constrained systematic investigations into the issue. Addressing it, we first introduce ZhoBLiMP, the most comprehensive benchmark of linguistic minimal pairs for Chinese to date, with 118 paradigms, covering 15 linguistic phenomena. We then train 20 LMs of different sizes (14M to 1.4B) on Chinese corpora of various volumes (100M to 3B tokens) and evaluate them along with 14 off-the-shelf LLMs on ZhoBLiMP. The overall results indicate that Chinese grammar can be mostly learned by models with around 500M parameters, trained on 1B tokens with one epoch, showing limited benefits for further scaling. Most (N=95) linguistic paradigms are of easy or medium difficulty for LMs, while there are still 13 paradigms that remain challenging even for models with up to 32B parameters. In regard to how LMs acquire Chinese grammar, we observe a U-shaped learning pattern in several phenomena, similar to those observed in child language acquisition. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06096v1-abstract-full').style.display = 'none'; document.getElementById('2411.06096v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06004">arXiv:2411.06004</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.06004">pdf</a>, <a href="https://arxiv.org/format/2411.06004">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> </div> </div> <p class="title is-5 mathjax"> Do Data Center Network Metrics Predict Application-Facing Performance? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chang%2C+B">Brian Chang</a>, <a href="/search/cs?searchtype=author&amp;query=Mogul%2C+J+C">Jeffrey C. Mogul</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+R">Rui Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+M">Mingyang Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Akella%2C+A">Aditya Akella</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06004v1-abstract-short" style="display: inline;"> Applications that run in large-scale data center networks (DCNs) rely on the DCN&#39;s ability to deliver application requests in a performant manner. DCNs expose a complex design and operational space, and network designers and operators care how different options along this space affect application performance. One might run controlled experiments and measure the corresponding application-facing per&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06004v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06004v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06004v1-abstract-full" style="display: none;"> Applications that run in large-scale data center networks (DCNs) rely on the DCN&#39;s ability to deliver application requests in a performant manner. DCNs expose a complex design and operational space, and network designers and operators care how different options along this space affect application performance. One might run controlled experiments and measure the corresponding application-facing performance, but such experiments become progressively infeasible at a large scale, and simulations risk yielding inaccurate or incomplete results. Instead, we show that we can predict application-facing performance through more easily measured network metrics. For example, network telemetry metrics (e.g., link utilization) can predict application-facing metrics (e.g., transfer latency). Through large-scale measurements of production networks, we study the correlation between the two types of metrics, and construct predictive, interpretable models that serve as a suggestive guideline to network designers and operators. We show that no single network metric is universally the best predictor (even though some prior work has focused on a single predictor). We found that simple linear models often have the lowest error, while queueing-based models are better in a few cases. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06004v1-abstract-full').style.display = 'none'; document.getElementById('2411.06004v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">17 (main body) + 5 (appendix) pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.04217">arXiv:2411.04217</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.04217">pdf</a>, <a href="https://arxiv.org/format/2411.04217">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Quantum Diffusion Models for Few-Shot Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+R">Ruhan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Ye Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+J">Jing Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Koike-Akino%2C+T">Toshiaki Koike-Akino</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.04217v1-abstract-short" style="display: inline;"> Modern quantum machine learning (QML) methods involve the variational optimization of parameterized quantum circuits on training datasets, followed by predictions on testing datasets. Most state-of-the-art QML algorithms currently lack practical advantages due to their limited learning capabilities, especially in few-shot learning tasks. In this work, we propose three new frameworks employing quan&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04217v1-abstract-full').style.display = 'inline'; document.getElementById('2411.04217v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.04217v1-abstract-full" style="display: none;"> Modern quantum machine learning (QML) methods involve the variational optimization of parameterized quantum circuits on training datasets, followed by predictions on testing datasets. Most state-of-the-art QML algorithms currently lack practical advantages due to their limited learning capabilities, especially in few-shot learning tasks. In this work, we propose three new frameworks employing quantum diffusion model (QDM) as a solution for the few-shot learning: label-guided generation inference (LGGI); label-guided denoising inference (LGDI); and label-guided noise addition inference (LGNAI). Experimental results demonstrate that our proposed algorithms significantly outperform existing methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04217v1-abstract-full').style.display = 'none'; document.getElementById('2411.04217v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.04042">arXiv:2411.04042</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.04042">pdf</a>, <a href="https://arxiv.org/format/2411.04042">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Databases">cs.DB</span> </div> </div> <p class="title is-5 mathjax"> Instance-Optimal Acyclic Join Processing Without Regret: Engineering the Yannakakis Algorithm in Column Stores </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Bekkers%2C+L">Liese Bekkers</a>, <a href="/search/cs?searchtype=author&amp;query=Neven%2C+F">Frank Neven</a>, <a href="/search/cs?searchtype=author&amp;query=Vansummeren%2C+S">Stijn Vansummeren</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y+R">Yisu Remy Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.04042v1-abstract-short" style="display: inline;"> Acyclic join queries can be evaluated instance-optimally using Yannakakis&#39; algorithm, which avoids needlessly large intermediate results through semi-join passes. Recent work proposes to address the significant hidden constant factors arising from a naive implementation of Yannakakis by decomposing the hash join operator into two suboperators, called Lookup and Expand. In this paper, we present a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04042v1-abstract-full').style.display = 'inline'; document.getElementById('2411.04042v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.04042v1-abstract-full" style="display: none;"> Acyclic join queries can be evaluated instance-optimally using Yannakakis&#39; algorithm, which avoids needlessly large intermediate results through semi-join passes. Recent work proposes to address the significant hidden constant factors arising from a naive implementation of Yannakakis by decomposing the hash join operator into two suboperators, called Lookup and Expand. In this paper, we present a novel method for integrating Lookup and Expand plans in interpreted environments, like column stores, formalizing them using Nested Semijoin Algebra (NSA) and implementing them through a shredding approach. We characterize the class of NSA expressions that can be evaluated instance-optimally as those that are 2-phase: no `shrinking&#39; operator is applied after an unnest (i.e., expand). We introduce Shredded Yannakakis (SYA), an evaluation algorithm for acyclic joins that, starting from a binary join plan, transforms it into a 2-phase NSA plan, and then evaluates it through the shredding technique. We show that SYA is provably robust (i.e., never produces large intermediate results) and without regret (i.e., is never worse than the binary join plan under a suitable cost model) on the class of well-behaved binary join plans. Our experiments on a suite of 1,849 queries show that SYA improves performance for 88.7% of the queries with speedups up to 188x, while remaining competitive on the other queries. We hope this approach offers a fresh perspective on Yannakakis&#39; algorithm, helping system engineers better understand its practical benefits and facilitating its adoption into a broader spectrum of query engines. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04042v1-abstract-full').style.display = 'none'; document.getElementById('2411.04042v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> H.2 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.03637">arXiv:2411.03637</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.03637">pdf</a>, <a href="https://arxiv.org/format/2411.03637">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Structure Consistent Gaussian Splatting with Matching Prior for Few-shot Novel View Synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Peng%2C+R">Rui Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+W">Wangze Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+L">Luyang Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Liao%2C+L">Liwei Liao</a>, <a href="/search/cs?searchtype=author&amp;query=Jiao%2C+J">Jianbo Jiao</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+R">Ronggang Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.03637v1-abstract-short" style="display: inline;"> Despite the substantial progress of novel view synthesis, existing methods, either based on the Neural Radiance Fields (NeRF) or more recently 3D Gaussian Splatting (3DGS), suffer significant degradation when the input becomes sparse. Numerous efforts have been introduced to alleviate this problem, but they still struggle to synthesize satisfactory results efficiently, especially in the large scen&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03637v1-abstract-full').style.display = 'inline'; document.getElementById('2411.03637v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.03637v1-abstract-full" style="display: none;"> Despite the substantial progress of novel view synthesis, existing methods, either based on the Neural Radiance Fields (NeRF) or more recently 3D Gaussian Splatting (3DGS), suffer significant degradation when the input becomes sparse. Numerous efforts have been introduced to alleviate this problem, but they still struggle to synthesize satisfactory results efficiently, especially in the large scene. In this paper, we propose SCGaussian, a Structure Consistent Gaussian Splatting method using matching priors to learn 3D consistent scene structure. Considering the high interdependence of Gaussian attributes, we optimize the scene structure in two folds: rendering geometry and, more importantly, the position of Gaussian primitives, which is hard to be directly constrained in the vanilla 3DGS due to the non-structure property. To achieve this, we present a hybrid Gaussian representation. Besides the ordinary non-structure Gaussian primitives, our model also consists of ray-based Gaussian primitives that are bound to matching rays and whose optimization of their positions is restricted along the ray. Thus, we can utilize the matching correspondence to directly enforce the position of these Gaussian primitives to converge to the surface points where rays intersect. Extensive experiments on forward-facing, surrounding, and complex large scenes show the effectiveness of our approach with state-of-the-art performance and high efficiency. Code is available at https://github.com/prstrive/SCGaussian. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03637v1-abstract-full').style.display = 'none'; document.getElementById('2411.03637v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2024 Accepted</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.03042">arXiv:2411.03042</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.03042">pdf</a>, <a href="https://arxiv.org/format/2411.03042">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Predictor-Corrector Enhanced Transformers with Exponential Moving Average Coefficient Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+B">Bei Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+T">Tong Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+R">Rui Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+J">Jiahao Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+Q">Qingyan Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+J">Junliang Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Tan%2C+X">Xu Tan</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+T">Tong Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+J">Jingbo Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Jingang Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Cai%2C+X">Xunliang Cai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.03042v1-abstract-short" style="display: inline;"> Residual networks, as discrete approximations of Ordinary Differential Equations (ODEs), have inspired significant advancements in neural network design, including multistep methods, high-order methods, and multi-particle dynamical systems. The precision of the solution to ODEs significantly affects parameter optimization, thereby impacting model performance. In this work, we present a series of a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03042v1-abstract-full').style.display = 'inline'; document.getElementById('2411.03042v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.03042v1-abstract-full" style="display: none;"> Residual networks, as discrete approximations of Ordinary Differential Equations (ODEs), have inspired significant advancements in neural network design, including multistep methods, high-order methods, and multi-particle dynamical systems. The precision of the solution to ODEs significantly affects parameter optimization, thereby impacting model performance. In this work, we present a series of advanced explorations of Transformer architecture design to minimize the error compared to the true ``solution.&#39;&#39; First, we introduce a predictor-corrector learning framework to minimize truncation errors, which consists of a high-order predictor and a multistep corrector. Second, we propose an exponential moving average-based coefficient learning method to strengthen our higher-order predictor. Extensive experiments on large-scale machine translation, abstractive summarization, language modeling, and natural language understanding benchmarks demonstrate the superiority of our approach. On the WMT&#39;14 English-German and English-French tasks, our model achieved BLEU scores of 30.95 and 44.27, respectively. Furthermore, on the OPUS multilingual machine translation task, our model surpasses a robust 3.8B DeepNet by an average of 2.9 SacreBLEU, using only 1/3 parameters. Notably, it also beats LLama models by 5.7 accuracy points on the LM Harness Evaluation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03042v1-abstract-full').style.display = 'none'; document.getElementById('2411.03042v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02465">arXiv:2411.02465</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.02465">pdf</a>, <a href="https://arxiv.org/format/2411.02465">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> See it, Think it, Sorted: Large Multimodal Models are Few-shot Time Series Anomaly Analyzers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhuang%2C+J">Jiaxin Zhuang</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+L">Leon Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zhenwei Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+R">Ruiqi Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+J">Jiawei Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+Y">Yuantao Gu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02465v1-abstract-short" style="display: inline;"> Time series anomaly detection (TSAD) is becoming increasingly vital due to the rapid growth of time series data across various sectors. Anomalies in web service data, for example, can signal critical incidents such as system failures or server malfunctions, necessitating timely detection and response. However, most existing TSAD methodologies rely heavily on manual feature engineering or require e&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02465v1-abstract-full').style.display = 'inline'; document.getElementById('2411.02465v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02465v1-abstract-full" style="display: none;"> Time series anomaly detection (TSAD) is becoming increasingly vital due to the rapid growth of time series data across various sectors. Anomalies in web service data, for example, can signal critical incidents such as system failures or server malfunctions, necessitating timely detection and response. However, most existing TSAD methodologies rely heavily on manual feature engineering or require extensive labeled training data, while also offering limited interpretability. To address these challenges, we introduce a pioneering framework called the Time Series Anomaly Multimodal Analyzer (TAMA), which leverages the power of Large Multimodal Models (LMMs) to enhance both the detection and interpretation of anomalies in time series data. By converting time series into visual formats that LMMs can efficiently process, TAMA leverages few-shot in-context learning capabilities to reduce dependence on extensive labeled datasets. Our methodology is validated through rigorous experimentation on multiple real-world datasets, where TAMA consistently outperforms state-of-the-art methods in TSAD tasks. Additionally, TAMA provides rich, natural language-based semantic analysis, offering deeper insights into the nature of detected anomalies. Furthermore, we contribute one of the first open-source datasets that includes anomaly detection labels, anomaly type labels, and contextual description, facilitating broader exploration and advancement within this critical field. Ultimately, TAMA not only excels in anomaly detection but also provides a comprehensive approach for understanding the underlying causes of anomalies, pushing TSAD forward through innovative methodologies and insights. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02465v1-abstract-full').style.display = 'none'; document.getElementById('2411.02465v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Under review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02353">arXiv:2411.02353</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.02353">pdf</a>, <a href="https://arxiv.org/format/2411.02353">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> Social-RAG: Retrieving from Group Interactions to Socially Ground Proactive AI Generation to Group Preferences </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+R">Ruotong Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+X">Xinyi Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Qiu%2C+L">Lin Qiu</a>, <a href="/search/cs?searchtype=author&amp;query=Chang%2C+J+C">Joseph Chee Chang</a>, <a href="/search/cs?searchtype=author&amp;query=Bragg%2C+J">Jonathan Bragg</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+A+X">Amy X. Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02353v1-abstract-short" style="display: inline;"> AI agents are increasingly tasked with making proactive suggestions in online spaces where groups collaborate, but can be unhelpful or even annoying, due to not fitting the group&#39;s preferences or behaving in socially inappropriate ways. Fortunately, group spaces have a rich history of prior social interactions and affordances for social feedback to support creating agents that align to a group&#39;s i&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02353v1-abstract-full').style.display = 'inline'; document.getElementById('2411.02353v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02353v1-abstract-full" style="display: none;"> AI agents are increasingly tasked with making proactive suggestions in online spaces where groups collaborate, but can be unhelpful or even annoying, due to not fitting the group&#39;s preferences or behaving in socially inappropriate ways. Fortunately, group spaces have a rich history of prior social interactions and affordances for social feedback to support creating agents that align to a group&#39;s interests and norms. We present Social-RAG, a workflow for grounding agents to social information about a group, which retrieves from prior group interactions, selects relevant social signals, and then feeds the context into a large language model to generate messages to the group. We implement this into PaperPing, our system that posts academic paper recommendations in group chat, leveraging social signals determined from formative studies with 39 researchers. From a three-month deployment in 18 channels, we observed PaperPing posted relevant messages in groups without disrupting their existing social practices, fostering group common ground. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02353v1-abstract-full').style.display = 'none'; document.getElementById('2411.02353v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02265">arXiv:2411.02265</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.02265">pdf</a>, <a href="https://arxiv.org/format/2411.02265">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Hunyuan-Large: An Open-Source MoE Model with 52 Billion Activated Parameters by Tencent </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Sun%2C+X">Xingwu Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yanfeng Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+Y">Yiqing Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Xie%2C+R">Ruobing Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+J">Jiaqi Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+K">Kai Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+S">Shuaipeng Li</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Z">Zhen Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+J">Jonny Han</a>, <a href="/search/cs?searchtype=author&amp;query=Shu%2C+X">Xiaobo Shu</a>, <a href="/search/cs?searchtype=author&amp;query=Bu%2C+J">Jiahao Bu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Z">Zhongzhi Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+X">Xuemeng Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Lian%2C+F">Fengzong Lian</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+S">Saiyong Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+J">Jianfeng Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Zeng%2C+Y">Yuyuan Zeng</a>, <a href="/search/cs?searchtype=author&amp;query=Ren%2C+X">Xiaoqin Ren</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+C">Chao Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+L">Lulu Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Mao%2C+Y">Yue Mao</a>, <a href="/search/cs?searchtype=author&amp;query=Xia%2C+J">Jun Xia</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+T">Tao Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+S">Suncong Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+K">Kan Wu</a> , et al. (83 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02265v3-abstract-short" style="display: inline;"> In this paper, we introduce Hunyuan-Large, which is currently the largest open-source Transformer-based mixture of experts model, with a total of 389 billion parameters and 52 billion activation parameters, capable of handling up to 256K tokens. We conduct a thorough evaluation of Hunyuan-Large&#39;s superior performance across various benchmarks including language understanding and generation, logica&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02265v3-abstract-full').style.display = 'inline'; document.getElementById('2411.02265v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02265v3-abstract-full" style="display: none;"> In this paper, we introduce Hunyuan-Large, which is currently the largest open-source Transformer-based mixture of experts model, with a total of 389 billion parameters and 52 billion activation parameters, capable of handling up to 256K tokens. We conduct a thorough evaluation of Hunyuan-Large&#39;s superior performance across various benchmarks including language understanding and generation, logical reasoning, mathematical problem-solving, coding, long-context, and aggregated tasks, where it outperforms LLama3.1-70B and exhibits comparable performance when compared to the significantly larger LLama3.1-405B model. Key practice of Hunyuan-Large include large-scale synthetic data that is orders larger than in previous literature, a mixed expert routing strategy, a key-value cache compression technique, and an expert-specific learning rate strategy. Additionally, we also investigate the scaling laws and learning rate schedule of mixture of experts models, providing valuable insights and guidances for future model development and optimization. The code and checkpoints of Hunyuan-Large are released to facilitate future innovations and applications. Codes: https://github.com/Tencent/Hunyuan-Large Models: https://huggingface.co/tencent/Tencent-Hunyuan-Large <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02265v3-abstract-full').style.display = 'none'; document.getElementById('2411.02265v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">17 pages, 4 Figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01792">arXiv:2411.01792</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.01792">pdf</a>, <a href="https://arxiv.org/format/2411.01792">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Fast Semi-supervised Learning on Large Graphs: An Improved Green-function Method </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Nie%2C+F">Feiping Nie</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+Y">Yitao Song</a>, <a href="/search/cs?searchtype=author&amp;query=Chang%2C+W">Wei Chang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+R">Rong Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xuelong Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01792v1-abstract-short" style="display: inline;"> In the graph-based semi-supervised learning, the Green-function method is a classical method that works by computing the Green&#39;s function in the graph space. However, when applied to large graphs, especially those sparse ones, this method performs unstably and unsatisfactorily. We make a detailed analysis on it and propose a novel method from the perspective of optimization. On fully connected gra&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01792v1-abstract-full').style.display = 'inline'; document.getElementById('2411.01792v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01792v1-abstract-full" style="display: none;"> In the graph-based semi-supervised learning, the Green-function method is a classical method that works by computing the Green&#39;s function in the graph space. However, when applied to large graphs, especially those sparse ones, this method performs unstably and unsatisfactorily. We make a detailed analysis on it and propose a novel method from the perspective of optimization. On fully connected graphs, the method is equivalent to the Green-function method and can be seen as another interpretation with physical meanings, while on non-fully connected graphs, it helps to explain why the Green-function method causes a mess on large sparse graphs. To solve this dilemma, we propose a workable approach to improve our proposed method. Unlike the original method, our improved method can also apply two accelerating techniques, Gaussian Elimination, and Anchored Graphs to become more efficient on large graphs. Finally, the extensive experiments prove our conclusions and the efficiency, accuracy, and stability of our improved Green&#39;s function method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01792v1-abstract-full').style.display = 'none'; document.getElementById('2411.01792v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01780">arXiv:2411.01780</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.01780">pdf</a>, <a href="https://arxiv.org/format/2411.01780">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Clustering Based on Density Propagation and Subcluster Merging </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Nie%2C+F">Feiping Nie</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+Y">Yitao Song</a>, <a href="/search/cs?searchtype=author&amp;query=Xue%2C+J">Jingjing Xue</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+R">Rong Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xuelong Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01780v1-abstract-short" style="display: inline;"> We propose the DPSM method, a density-based node clustering approach that automatically determines the number of clusters and can be applied in both data space and graph space. Unlike traditional density-based clustering methods, which necessitate calculating the distance between any two nodes, our proposed technique determines density through a propagation process, thereby making it suitable for&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01780v1-abstract-full').style.display = 'inline'; document.getElementById('2411.01780v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01780v1-abstract-full" style="display: none;"> We propose the DPSM method, a density-based node clustering approach that automatically determines the number of clusters and can be applied in both data space and graph space. Unlike traditional density-based clustering methods, which necessitate calculating the distance between any two nodes, our proposed technique determines density through a propagation process, thereby making it suitable for a graph space. In DPSM, nodes are partitioned into small clusters based on propagated density. The partitioning technique has been proved to be sound and complete. We then extend the concept of spectral clustering from individual nodes to these small clusters, while introducing the CluCut measure to guide cluster merging. This measure is modified in various ways to account for cluster properties, thus provides guidance on when to terminate the merging process. Various experiments have validated the effectiveness of DOSM and the accuracy of these conclusions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01780v1-abstract-full').style.display = 'none'; document.getElementById('2411.01780v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00904">arXiv:2411.00904</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.00904">pdf</a>, <a href="https://arxiv.org/format/2411.00904">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Similarity and Dissimilarity Guided Co-association Matrix Construction for Ensemble Clustering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+X">Xu Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Jia%2C+Y">Yuheng Jia</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+M">Mofei Song</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+R">Ran Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00904v1-abstract-short" style="display: inline;"> Ensemble clustering aggregates multiple weak clusterings to achieve a more accurate and robust consensus result. The Co-Association matrix (CA matrix) based method is the mainstream ensemble clustering approach that constructs the similarity relationships between sample pairs according the weak clustering partitions to generate the final clustering result. However, the existing methods neglect tha&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00904v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00904v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00904v1-abstract-full" style="display: none;"> Ensemble clustering aggregates multiple weak clusterings to achieve a more accurate and robust consensus result. The Co-Association matrix (CA matrix) based method is the mainstream ensemble clustering approach that constructs the similarity relationships between sample pairs according the weak clustering partitions to generate the final clustering result. However, the existing methods neglect that the quality of cluster is related to its size, i.e., a cluster with smaller size tends to higher accuracy. Moreover, they also do not consider the valuable dissimilarity information in the base clusterings which can reflect the varying importance of sample pairs that are completely disconnected. To this end, we propose the Similarity and Dissimilarity Guided Co-association matrix (SDGCA) to achieve ensemble clustering. First, we introduce normalized ensemble entropy to estimate the quality of each cluster, and construct a similarity matrix based on this estimation. Then, we employ the random walk to explore high-order proximity of base clusterings to construct a dissimilarity matrix. Finally, the adversarial relationship between the similarity matrix and the dissimilarity matrix is utilized to construct a promoted CA matrix for ensemble clustering. We compared our method with 13 state-of-the-art methods across 12 datasets, and the results demonstrated the superiority clustering ability and robustness of the proposed approach. The code is available at https://github.com/xuz2019/SDGCA. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00904v1-abstract-full').style.display = 'none'; document.getElementById('2411.00904v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00827">arXiv:2411.00827</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.00827">pdf</a>, <a href="https://arxiv.org/format/2411.00827">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> IDEATOR: Jailbreaking Large Vision-Language Models Using Themselves </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+R">Ruofan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+B">Bo Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xiaosen Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+X">Xingjun Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+Y">Yu-Gang Jiang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00827v2-abstract-short" style="display: inline;"> As large Vision-Language Models (VLMs) grow in prominence, ensuring their safe deployment has become critical. Recent studies have explored VLM robustness against jailbreak attacks--techniques that exploit model vulnerabilities to elicit harmful outputs. However, the limited availability of diverse multi-modal data has led current approaches to rely heavily on adversarial or manually crafted image&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00827v2-abstract-full').style.display = 'inline'; document.getElementById('2411.00827v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00827v2-abstract-full" style="display: none;"> As large Vision-Language Models (VLMs) grow in prominence, ensuring their safe deployment has become critical. Recent studies have explored VLM robustness against jailbreak attacks--techniques that exploit model vulnerabilities to elicit harmful outputs. However, the limited availability of diverse multi-modal data has led current approaches to rely heavily on adversarial or manually crafted images derived from harmful text datasets, which may lack effectiveness and diversity across different contexts. In this paper, we propose a novel jailbreak method named IDEATOR, which autonomously generates malicious image-text pairs for black-box jailbreak attacks. IDEATOR is based on the insight that VLMs themselves could serve as powerful red team models for generating multimodal jailbreak prompts. Specifically, IDEATOR uses a VLM to create targeted jailbreak texts and pairs them with jailbreak images generated by a state-of-the-art diffusion model. Our extensive experiments demonstrate IDEATOR&#39;s high effectiveness and transferability. Notably, it achieves a 94% success rate in jailbreaking MiniGPT-4 with an average of only 5.34 queries, and high success rates of 82%, 88%, and 75% when transferred to LLaVA, InstructBLIP, and Meta&#39;s Chameleon, respectively. IDEATOR uncovers specific vulnerabilities in VLMs under black-box conditions, underscoring the need for improved safety mechanisms. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00827v2-abstract-full').style.display = 'none'; document.getElementById('2411.00827v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00689">arXiv:2411.00689</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.00689">pdf</a>, <a href="https://arxiv.org/format/2411.00689">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Towards Multi-Source Retrieval-Augmented Generation via Synergizing Reasoning and Preference-Driven Retrieval </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+Q">Qingfei Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+R">Ruobing Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xin Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zha%2C+D">Daren Zha</a>, <a href="/search/cs?searchtype=author&amp;query=Mu%2C+N">Nan Mu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00689v1-abstract-short" style="display: inline;"> Retrieval-Augmented Generation (RAG) has emerged as a reliable external knowledge augmentation technique to mitigate hallucination issues and parameterized knowledge limitations in Large Language Models (LLMs). Existing Adaptive RAG (ARAG) systems struggle to effectively explore multiple retrieval sources due to their inability to select the right source at the right time. To address this, we prop&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00689v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00689v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00689v1-abstract-full" style="display: none;"> Retrieval-Augmented Generation (RAG) has emerged as a reliable external knowledge augmentation technique to mitigate hallucination issues and parameterized knowledge limitations in Large Language Models (LLMs). Existing Adaptive RAG (ARAG) systems struggle to effectively explore multiple retrieval sources due to their inability to select the right source at the right time. To address this, we propose a multi-source ARAG framework, termed MSPR, which synergizes reasoning and preference-driven retrieval to adaptive decide &#34;when and what to retrieve&#34; and &#34;which retrieval source to use&#34;. To better adapt to retrieval sources of differing characteristics, we also employ retrieval action adjustment and answer feedback strategy. They enable our framework to fully explore the high-quality primary source while supplementing it with secondary sources at the right time. Extensive and multi-dimensional experiments conducted on three datasets demonstrate the superiority and effectiveness of MSPR. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00689v1-abstract-full').style.display = 'none'; document.getElementById('2411.00689v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 1 figure</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00430">arXiv:2411.00430</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.00430">pdf</a>, <a href="https://arxiv.org/format/2411.00430">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Class Incremental Learning with Task-Specific Batch Normalization and Out-of-Distribution Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xie%2C+X">Xuchen Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Qiu%2C+Y">Yiqiao Qiu</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+R">Run Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+W">Weishi Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+R">Ruixuan Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00430v1-abstract-short" style="display: inline;"> This study focuses on incremental learning for image classification, exploring how to reduce catastrophic forgetting of all learned knowledge when access to old data is restricted due to memory or privacy constraints. The challenge of incremental learning lies in achieving an optimal balance between plasticity, the ability to learn new knowledge, and stability, the ability to retain old knowledge.&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00430v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00430v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00430v1-abstract-full" style="display: none;"> This study focuses on incremental learning for image classification, exploring how to reduce catastrophic forgetting of all learned knowledge when access to old data is restricted due to memory or privacy constraints. The challenge of incremental learning lies in achieving an optimal balance between plasticity, the ability to learn new knowledge, and stability, the ability to retain old knowledge. Based on whether the task identifier (task-ID) of an image can be obtained during the test stage, incremental learning for image classifcation is divided into two main paradigms, which are task incremental learning (TIL) and class incremental learning (CIL). The TIL paradigm has access to the task-ID, allowing it to use multiple task-specific classification heads selected based on the task-ID. Consequently, in CIL, where the task-ID is unavailable, TIL methods must predict the task-ID to extend their application to the CIL paradigm. Our previous method for TIL adds task-specific batch normalization and classification heads incrementally. This work extends the method by predicting task-ID through an &#34;unknown&#34; class added to each classification head. The head with the lowest &#34;unknown&#34; probability is selected, enabling task-ID prediction and making the method applicable to CIL. The task-specific batch normalization (BN) modules effectively adjust the distribution of output feature maps across different tasks, enhancing the model&#39;s plasticity.Moreover, since BN has much fewer parameters compared to convolutional kernels, by only modifying the BN layers as new tasks arrive, the model can effectively manage parameter growth while ensuring stability across tasks. The innovation of this study lies in the first-time introduction of task-specific BN into CIL and verifying the feasibility of extending TIL methods to CIL through task-ID prediction with state-of-the-art performance on multiple datasets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00430v1-abstract-full').style.display = 'none'; document.getElementById('2411.00430v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 4 figures, 4 tables, in submission to IEEE Transaction of Multimedia Journal (TMM)</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> F.2.2; I.2.7 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.23703">arXiv:2410.23703</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.23703">pdf</a>, <a href="https://arxiv.org/format/2410.23703">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> OCEAN: Offline Chain-of-thought Evaluation and Alignment in Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wu%2C+J">Junda Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xintong Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+R">Ruoyu Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Xia%2C+Y">Yu Xia</a>, <a href="/search/cs?searchtype=author&amp;query=Xiong%2C+Y">Yuxin Xiong</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Jianing Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+T">Tong Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xiang Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Kveton%2C+B">Branislav Kveton</a>, <a href="/search/cs?searchtype=author&amp;query=Yao%2C+L">Lina Yao</a>, <a href="/search/cs?searchtype=author&amp;query=Shang%2C+J">Jingbo Shang</a>, <a href="/search/cs?searchtype=author&amp;query=McAuley%2C+J">Julian McAuley</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.23703v1-abstract-short" style="display: inline;"> Offline evaluation of LLMs is crucial in understanding their capacities, though current methods remain underexplored in existing research. In this work, we focus on the offline evaluation of the chain-of-thought capabilities and show how to optimize LLMs based on the proposed evaluation method. To enable offline feedback with rich knowledge and reasoning paths, we use knowledge graphs (e.g., Wikid&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23703v1-abstract-full').style.display = 'inline'; document.getElementById('2410.23703v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.23703v1-abstract-full" style="display: none;"> Offline evaluation of LLMs is crucial in understanding their capacities, though current methods remain underexplored in existing research. In this work, we focus on the offline evaluation of the chain-of-thought capabilities and show how to optimize LLMs based on the proposed evaluation method. To enable offline feedback with rich knowledge and reasoning paths, we use knowledge graphs (e.g., Wikidata5m) to provide feedback on the generated chain of thoughts. Due to the heterogeneity between LLM reasoning and KG structures, direct interaction and feedback from KGs on LLM behavior are challenging, as they require accurate entity linking and grounding of LLM-generated chains of thought in the KG. To address the above challenge, we propose an offline chain-of-thought evaluation framework, OCEAN, which models chain-of-thought reasoning in LLMs as an MDP and evaluate the policy&#39;s alignment with KG preference modeling. To overcome the reasoning heterogeneity and grounding problems, we leverage on-policy KG exploration and RL to model a KG policy that generates token-level likelihood distributions for LLM-generated chain-of-thought reasoning paths, simulating KG reasoning preference. Then we incorporate the knowledge-graph feedback on the validity and alignment of the generated reasoning paths into inverse propensity scores and propose KG-IPS estimator. Theoretically, we prove the unbiasedness of the proposed KG-IPS estimator and provide a lower bound on its variance. With the off-policy evaluated value function, we can directly enable off-policy optimization to further enhance chain-of-thought alignment. Our empirical study shows that OCEAN can be efficiently optimized for generating chain-of-thought reasoning paths with higher estimated values without affecting LLMs&#39; general abilities in downstream tasks or their internal knowledge. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23703v1-abstract-full').style.display = 'none'; document.getElementById('2410.23703v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.23496">arXiv:2410.23496</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.23496">pdf</a>, <a href="https://arxiv.org/format/2410.23496">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Smaller Large Language Models Can Do Moral Self-Correction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+G">Guangliang Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Xue%2C+Z">Zhiyu Xue</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+R">Rongrong Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Johnson%2C+K+M">Kristen Marie Johnson</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.23496v1-abstract-short" style="display: inline;"> Self-correction is one of the most amazing emerging capabilities of Large Language Models (LLMs), enabling LLMs to self-modify an inappropriate output given a natural language feedback which describes the problems of that output. Moral self-correction is a post-hoc approach correcting unethical generations without requiring a gradient update, making it both computationally lightweight and capable&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23496v1-abstract-full').style.display = 'inline'; document.getElementById('2410.23496v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.23496v1-abstract-full" style="display: none;"> Self-correction is one of the most amazing emerging capabilities of Large Language Models (LLMs), enabling LLMs to self-modify an inappropriate output given a natural language feedback which describes the problems of that output. Moral self-correction is a post-hoc approach correcting unethical generations without requiring a gradient update, making it both computationally lightweight and capable of preserving the language modeling ability. Previous works have shown that LLMs can self-debias, and it has been reported that small models, i.e., those with less than 22B parameters, are not capable of moral self-correction. However, there is no direct proof as to why such smaller models fall short of moral self-correction, though previous research hypothesizes that larger models are skilled in following instructions and understanding abstract social norms. In this paper, we empirically validate this hypothesis in the context of social stereotyping, through meticulous prompting. Our experimental results indicate that (i) surprisingly, 3.8B LLMs with proper safety alignment fine-tuning can achieve very good moral self-correction performance, highlighting the significant effects of safety alignment; and (ii) small LLMs are indeed weaker than larger-scale models in terms of comprehending social norms and self-explanation through CoT, but all scales of LLMs show bad self-correction performance given unethical instructions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23496v1-abstract-full').style.display = 'none'; document.getElementById('2410.23496v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.23450">arXiv:2410.23450</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.23450">pdf</a>, <a href="https://arxiv.org/format/2410.23450">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Return Augmented Decision Transformer for Off-Dynamics Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+R">Ruhan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Y">Yu Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Zhishuai Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+D">Dongruo Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+P">Pan Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.23450v1-abstract-short" style="display: inline;"> We study offline off-dynamics reinforcement learning (RL) to utilize data from an easily accessible source domain to enhance policy learning in a target domain with limited data. Our approach centers on return-conditioned supervised learning (RCSL), particularly focusing on the decision transformer (DT), which can predict actions conditioned on desired return guidance and complete trajectory histo&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23450v1-abstract-full').style.display = 'inline'; document.getElementById('2410.23450v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.23450v1-abstract-full" style="display: none;"> We study offline off-dynamics reinforcement learning (RL) to utilize data from an easily accessible source domain to enhance policy learning in a target domain with limited data. Our approach centers on return-conditioned supervised learning (RCSL), particularly focusing on the decision transformer (DT), which can predict actions conditioned on desired return guidance and complete trajectory history. Previous works tackle the dynamics shift problem by augmenting the reward in the trajectory from the source domain to match the optimal trajectory in the target domain. However, this strategy can not be directly applicable in RCSL owing to (1) the unique form of the RCSL policy class, which explicitly depends on the return, and (2) the absence of a straightforward representation of the optimal trajectory distribution. We propose the Return Augmented Decision Transformer (RADT) method, where we augment the return in the source domain by aligning its distribution with that in the target domain. We provide the theoretical analysis demonstrating that the RCSL policy learned from RADT achieves the same level of suboptimality as would be obtained without a dynamics shift. We introduce two practical implementations RADT-DARA and RADT-MV respectively. Extensive experiments conducted on D4RL datasets reveal that our methods generally outperform dynamic programming based methods in off-dynamics RL scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23450v1-abstract-full').style.display = 'none'; document.getElementById('2410.23450v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">26 pages, 10 tables, 10 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.22959">arXiv:2410.22959</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.22959">pdf</a>, <a href="https://arxiv.org/format/2410.22959">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> EnsIR: An Ensemble Algorithm for Image Restoration via Gaussian Mixture Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Sun%2C+S">Shangquan Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Ren%2C+W">Wenqi Ren</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Zikun Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Park%2C+H">Hyunhee Park</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+R">Rui Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Cao%2C+X">Xiaochun Cao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.22959v1-abstract-short" style="display: inline;"> Image restoration has experienced significant advancements due to the development of deep learning. Nevertheless, it encounters challenges related to ill-posed problems, resulting in deviations between single model predictions and ground-truths. Ensemble learning, as a powerful machine learning technique, aims to address these deviations by combining the predictions of multiple base models. Most e&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22959v1-abstract-full').style.display = 'inline'; document.getElementById('2410.22959v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.22959v1-abstract-full" style="display: none;"> Image restoration has experienced significant advancements due to the development of deep learning. Nevertheless, it encounters challenges related to ill-posed problems, resulting in deviations between single model predictions and ground-truths. Ensemble learning, as a powerful machine learning technique, aims to address these deviations by combining the predictions of multiple base models. Most existing works adopt ensemble learning during the design of restoration models, while only limited research focuses on the inference-stage ensemble of pre-trained restoration models. Regression-based methods fail to enable efficient inference, leading researchers in academia and industry to prefer averaging as their choice for post-training ensemble. To address this, we reformulate the ensemble problem of image restoration into Gaussian mixture models (GMMs) and employ an expectation maximization (EM)-based algorithm to estimate ensemble weights for aggregating prediction candidates. We estimate the range-wise ensemble weights on a reference set and store them in a lookup table (LUT) for efficient ensemble inference on the test set. Our algorithm is model-agnostic and training-free, allowing seamless integration and enhancement of various pre-trained image restoration models. It consistently outperforms regression based methods and averaging ensemble approaches on 14 benchmarks across 3 image restoration tasks, including super-resolution, deblurring and deraining. The codes and all estimated weights have been released in Github. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22959v1-abstract-full').style.display = 'none'; document.getElementById('2410.22959v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages for main manuscript, additional 17 pages for appendix, 18 figures, 17MB</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.22258">arXiv:2410.22258</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.22258">pdf</a>, <a href="https://arxiv.org/format/2410.22258">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> LipKernel: Lipschitz-Bounded Convolutional Neural Networks via Dissipative Layers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Pauli%2C+P">Patricia Pauli</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+R">Ruigang Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Manchester%2C+I">Ian Manchester</a>, <a href="/search/cs?searchtype=author&amp;query=Allg%C3%B6wer%2C+F">Frank Allg枚wer</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.22258v1-abstract-short" style="display: inline;"> We propose a novel layer-wise parameterization for convolutional neural networks (CNNs) that includes built-in robustness guarantees by enforcing a prescribed Lipschitz bound. Each layer in our parameterization is designed to satisfy a linear matrix inequality (LMI), which in turn implies dissipativity with respect to a specific supply rate. Collectively, these layer-wise LMIs ensure Lipschitz bou&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22258v1-abstract-full').style.display = 'inline'; document.getElementById('2410.22258v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.22258v1-abstract-full" style="display: none;"> We propose a novel layer-wise parameterization for convolutional neural networks (CNNs) that includes built-in robustness guarantees by enforcing a prescribed Lipschitz bound. Each layer in our parameterization is designed to satisfy a linear matrix inequality (LMI), which in turn implies dissipativity with respect to a specific supply rate. Collectively, these layer-wise LMIs ensure Lipschitz boundedness for the input-output mapping of the neural network, yielding a more expressive parameterization than through spectral bounds or orthogonal layers. Our new method LipKernel directly parameterizes dissipative convolution kernels using a 2-D Roesser-type state space model. This means that the convolutional layers are given in standard form after training and can be evaluated without computational overhead. In numerical experiments, we show that the run-time using our method is orders of magnitude faster than state-of-the-art Lipschitz-bounded networks that parameterize convolutions in the Fourier domain, making our approach particularly attractive for improving robustness of learning-based real-time perception or control in robotics, autonomous vehicles, or automation systems. We focus on CNNs, and in contrast to previous works, our approach accommodates a wide variety of layers typically used in CNNs, including 1-D and 2-D convolutional layers, maximum and average pooling layers, as well as strided and dilated convolutions and zero padding. However, our approach naturally extends beyond CNNs as we can incorporate any layer that is incrementally dissipative. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22258v1-abstract-full').style.display = 'none'; document.getElementById('2410.22258v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.22059">arXiv:2410.22059</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.22059">pdf</a>, <a href="https://arxiv.org/format/2410.22059">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> PACA: Perspective-Aware Cross-Attention Representation for Zero-Shot Scene Rearrangement </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Jin%2C+S">Shutong Jin</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+R">Ruiyu Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+K">Kuangyi Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Pokorny%2C+F+T">Florian T. Pokorny</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.22059v1-abstract-short" style="display: inline;"> Scene rearrangement, like table tidying, is a challenging task in robotic manipulation due to the complexity of predicting diverse object arrangements. Web-scale trained generative models such as Stable Diffusion can aid by generating natural scenes as goals. To facilitate robot execution, object-level representations must be extracted to match the real scenes with the generated goals and to calcu&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22059v1-abstract-full').style.display = 'inline'; document.getElementById('2410.22059v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.22059v1-abstract-full" style="display: none;"> Scene rearrangement, like table tidying, is a challenging task in robotic manipulation due to the complexity of predicting diverse object arrangements. Web-scale trained generative models such as Stable Diffusion can aid by generating natural scenes as goals. To facilitate robot execution, object-level representations must be extracted to match the real scenes with the generated goals and to calculate object pose transformations. Current methods typically use a multi-step design that involves separate models for generation, segmentation, and feature encoding, which can lead to a low success rate due to error accumulation. Furthermore, they lack control over the viewing perspectives of the generated goals, restricting the tasks to 3-DoF settings. In this paper, we propose PACA, a zero-shot pipeline for scene rearrangement that leverages perspective-aware cross-attention representation derived from Stable Diffusion. Specifically, we develop a representation that integrates generation, segmentation, and feature encoding into a single step to produce object-level representations. Additionally, we introduce perspective control, thus enabling the matching of 6-DoF camera views and extending past approaches that were limited to 3-DoF top-down views. The efficacy of our method is demonstrated through its zero-shot performance in real robot experiments across various scenes, achieving an average matching accuracy and execution success rate of 87% and 67%, respectively. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22059v1-abstract-full').style.display = 'none'; document.getElementById('2410.22059v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by WACV2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.21276">arXiv:2410.21276</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.21276">pdf</a>, <a href="https://arxiv.org/format/2410.21276">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> GPT-4o System Card </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=OpenAI"> OpenAI</a>, <a href="/search/cs?searchtype=author&amp;query=%3A"> :</a>, <a href="/search/cs?searchtype=author&amp;query=Hurst%2C+A">Aaron Hurst</a>, <a href="/search/cs?searchtype=author&amp;query=Lerer%2C+A">Adam Lerer</a>, <a href="/search/cs?searchtype=author&amp;query=Goucher%2C+A+P">Adam P. Goucher</a>, <a href="/search/cs?searchtype=author&amp;query=Perelman%2C+A">Adam Perelman</a>, <a href="/search/cs?searchtype=author&amp;query=Ramesh%2C+A">Aditya Ramesh</a>, <a href="/search/cs?searchtype=author&amp;query=Clark%2C+A">Aidan Clark</a>, <a href="/search/cs?searchtype=author&amp;query=Ostrow%2C+A">AJ Ostrow</a>, <a href="/search/cs?searchtype=author&amp;query=Welihinda%2C+A">Akila Welihinda</a>, <a href="/search/cs?searchtype=author&amp;query=Hayes%2C+A">Alan Hayes</a>, <a href="/search/cs?searchtype=author&amp;query=Radford%2C+A">Alec Radford</a>, <a href="/search/cs?searchtype=author&amp;query=M%C4%85dry%2C+A">Aleksander M膮dry</a>, <a href="/search/cs?searchtype=author&amp;query=Baker-Whitcomb%2C+A">Alex Baker-Whitcomb</a>, <a href="/search/cs?searchtype=author&amp;query=Beutel%2C+A">Alex Beutel</a>, <a href="/search/cs?searchtype=author&amp;query=Borzunov%2C+A">Alex Borzunov</a>, <a href="/search/cs?searchtype=author&amp;query=Carney%2C+A">Alex Carney</a>, <a href="/search/cs?searchtype=author&amp;query=Chow%2C+A">Alex Chow</a>, <a href="/search/cs?searchtype=author&amp;query=Kirillov%2C+A">Alex Kirillov</a>, <a href="/search/cs?searchtype=author&amp;query=Nichol%2C+A">Alex Nichol</a>, <a href="/search/cs?searchtype=author&amp;query=Paino%2C+A">Alex Paino</a>, <a href="/search/cs?searchtype=author&amp;query=Renzin%2C+A">Alex Renzin</a>, <a href="/search/cs?searchtype=author&amp;query=Passos%2C+A+T">Alex Tachard Passos</a>, <a href="/search/cs?searchtype=author&amp;query=Kirillov%2C+A">Alexander Kirillov</a>, <a href="/search/cs?searchtype=author&amp;query=Christakis%2C+A">Alexi Christakis</a> , et al. (395 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.21276v1-abstract-short" style="display: inline;"> GPT-4o is an autoregressive omni model that accepts as input any combination of text, audio, image, and video, and generates any combination of text, audio, and image outputs. It&#39;s trained end-to-end across text, vision, and audio, meaning all inputs and outputs are processed by the same neural network. GPT-4o can respond to audio inputs in as little as 232 milliseconds, with an average of 320 mil&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21276v1-abstract-full').style.display = 'inline'; document.getElementById('2410.21276v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.21276v1-abstract-full" style="display: none;"> GPT-4o is an autoregressive omni model that accepts as input any combination of text, audio, image, and video, and generates any combination of text, audio, and image outputs. It&#39;s trained end-to-end across text, vision, and audio, meaning all inputs and outputs are processed by the same neural network. GPT-4o can respond to audio inputs in as little as 232 milliseconds, with an average of 320 milliseconds, which is similar to human response time in conversation. It matches GPT-4 Turbo performance on text in English and code, with significant improvement on text in non-English languages, while also being much faster and 50\% cheaper in the API. GPT-4o is especially better at vision and audio understanding compared to existing models. In line with our commitment to building AI safely and consistent with our voluntary commitments to the White House, we are sharing the GPT-4o System Card, which includes our Preparedness Framework evaluations. In this System Card, we provide a detailed look at GPT-4o&#39;s capabilities, limitations, and safety evaluations across multiple categories, focusing on speech-to-speech while also evaluating text and image capabilities, and measures we&#39;ve implemented to ensure the model is safe and aligned. We also include third-party assessments on dangerous capabilities, as well as discussion of potential societal impacts of GPT-4o&#39;s text and vision capabilities. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21276v1-abstract-full').style.display = 'none'; document.getElementById('2410.21276v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.21127">arXiv:2410.21127</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.21127">pdf</a>, <a href="https://arxiv.org/format/2410.21127">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Quantitative Methods">q-bio.QM</span> </div> </div> <p class="title is-5 mathjax"> Retrieval-Enhanced Mutation Mastery: Augmenting Zero-Shot Prediction of Protein Language Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Tan%2C+Y">Yang Tan</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+R">Ruilin Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+B">Banghao Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Hong%2C+L">Liang Hong</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+B">Bingxin Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.21127v1-abstract-short" style="display: inline;"> Enzyme engineering enables the modification of wild-type proteins to meet industrial and research demands by enhancing catalytic activity, stability, binding affinities, and other properties. The emergence of deep learning methods for protein modeling has demonstrated superior results at lower costs compared to traditional approaches such as directed evolution and rational design. In mutation effe&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21127v1-abstract-full').style.display = 'inline'; document.getElementById('2410.21127v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.21127v1-abstract-full" style="display: none;"> Enzyme engineering enables the modification of wild-type proteins to meet industrial and research demands by enhancing catalytic activity, stability, binding affinities, and other properties. The emergence of deep learning methods for protein modeling has demonstrated superior results at lower costs compared to traditional approaches such as directed evolution and rational design. In mutation effect prediction, the key to pre-training deep learning models lies in accurately interpreting the complex relationships among protein sequence, structure, and function. This study introduces a retrieval-enhanced protein language model for comprehensive analysis of native properties from sequence and local structural interactions, as well as evolutionary properties from retrieved homologous sequences. The state-of-the-art performance of the proposed ProtREM is validated on over 2 million mutants across 217 assays from an open benchmark (ProteinGym). We also conducted post-hoc analyses of the model&#39;s ability to improve the stability and binding affinity of a VHH antibody. Additionally, we designed 10 new mutants on a DNA polymerase and conducted wet-lab experiments to evaluate their enhanced activity at higher temperatures. Both in silico and experimental evaluations confirmed that our method provides reliable predictions of mutation effects, offering an auxiliary tool for biologists aiming to evolve existing enzymes. The implementation is publicly available at https://github.com/tyang816/ProtREM. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21127v1-abstract-full').style.display = 'none'; document.getElementById('2410.21127v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">25 pages, 10 figures, 8 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20974">arXiv:2410.20974</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.20974">pdf</a>, <a href="https://arxiv.org/format/2410.20974">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MovieCharacter: A Tuning-Free Framework for Controllable Character Video Synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Qiu%2C+D">Di Qiu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Z">Zheng Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+R">Rui Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Fan%2C+M">Mingyuan Fan</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+C">Changqian Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Huan%2C+J">Junshi Huan</a>, <a href="/search/cs?searchtype=author&amp;query=Wen%2C+X">Xiang Wen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.20974v1-abstract-short" style="display: inline;"> Recent advancements in character video synthesis still depend on extensive fine-tuning or complex 3D modeling processes, which can restrict accessibility and hinder real-time applicability. To address these challenges, we propose a simple yet effective tuning-free framework for character video synthesis, named MovieCharacter, designed to streamline the synthesis process while ensuring high-quality&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20974v1-abstract-full').style.display = 'inline'; document.getElementById('2410.20974v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20974v1-abstract-full" style="display: none;"> Recent advancements in character video synthesis still depend on extensive fine-tuning or complex 3D modeling processes, which can restrict accessibility and hinder real-time applicability. To address these challenges, we propose a simple yet effective tuning-free framework for character video synthesis, named MovieCharacter, designed to streamline the synthesis process while ensuring high-quality outcomes. Our framework decomposes the synthesis task into distinct, manageable modules: character segmentation and tracking, video object removal, character motion imitation, and video composition. This modular design not only facilitates flexible customization but also ensures that each component operates collaboratively to effectively meet user needs. By leveraging existing open-source models and integrating well-established techniques, MovieCharacter achieves impressive synthesis results without necessitating substantial resources or proprietary datasets. Experimental results demonstrate that our framework enhances the efficiency, accessibility, and adaptability of character video synthesis, paving the way for broader creative and interactive applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20974v1-abstract-full').style.display = 'none'; document.getElementById('2410.20974v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20812">arXiv:2410.20812</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.20812">pdf</a>, <a href="https://arxiv.org/format/2410.20812">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Fidelity-Imposed Displacement Editing for the Learn2Reg 2024 SHG-BF Challenge </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Jiacheng Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xiang Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+R">Renjiu Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+R">Rongguang Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+M">Min Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yaonan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Jiazheng Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+H">Hao Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+H">Hang Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.20812v1-abstract-short" style="display: inline;"> Co-examination of second-harmonic generation (SHG) and bright-field (BF) microscopy enables the differentiation of tissue components and collagen fibers, aiding the analysis of human breast and pancreatic cancer tissues. However, large discrepancies between SHG and BF images pose challenges for current learning-based registration models in aligning SHG to BF. In this paper, we propose a novel mult&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20812v1-abstract-full').style.display = 'inline'; document.getElementById('2410.20812v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20812v1-abstract-full" style="display: none;"> Co-examination of second-harmonic generation (SHG) and bright-field (BF) microscopy enables the differentiation of tissue components and collagen fibers, aiding the analysis of human breast and pancreatic cancer tissues. However, large discrepancies between SHG and BF images pose challenges for current learning-based registration models in aligning SHG to BF. In this paper, we propose a novel multi-modal registration framework that employs fidelity-imposed displacement editing to address these challenges. The framework integrates batch-wise contrastive learning, feature-based pre-alignment, and instance-level optimization. Experimental results from the Learn2Reg COMULISglobe SHG-BF Challenge validate the effectiveness of our method, securing the 1st place on the online leaderboard. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20812v1-abstract-full').style.display = 'none'; document.getElementById('2410.20812v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.19743">arXiv:2410.19743</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.19743">pdf</a>, <a href="https://arxiv.org/format/2410.19743">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> AppBench: Planning of Multiple APIs from Various APPs for Complex User Instruction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+H">Hongru Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+R">Rui Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Xue%2C+B">Boyang Xue</a>, <a href="/search/cs?searchtype=author&amp;query=Xia%2C+H">Heming Xia</a>, <a href="/search/cs?searchtype=author&amp;query=Cao%2C+J">Jingtao Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Zeming Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Pan%2C+J+Z">Jeff Z. Pan</a>, <a href="/search/cs?searchtype=author&amp;query=Wong%2C+K">Kam-Fai Wong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.19743v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) can interact with the real world by connecting with versatile external APIs, resulting in better problem-solving and task automation capabilities. Previous research primarily focuses on APIs with limited arguments from a single source or overlooks the complex dependency relationship between different APIs. However, it is essential to utilize multiple APIs collaborative&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19743v1-abstract-full').style.display = 'inline'; document.getElementById('2410.19743v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.19743v1-abstract-full" style="display: none;"> Large Language Models (LLMs) can interact with the real world by connecting with versatile external APIs, resulting in better problem-solving and task automation capabilities. Previous research primarily focuses on APIs with limited arguments from a single source or overlooks the complex dependency relationship between different APIs. However, it is essential to utilize multiple APIs collaboratively from various sources (e.g., different Apps in the iPhone), especially for complex user instructions. In this paper, we introduce \texttt{AppBench}, the first benchmark to evaluate LLMs&#39; ability to plan and execute multiple APIs from various sources in order to complete the user&#39;s task. Specifically, we consider two significant challenges in multiple APIs: \textit{1) graph structures:} some APIs can be executed independently while others need to be executed one by one, resulting in graph-like execution order; and \textit{2) permission constraints:} which source is authorized to execute the API call. We have experimental results on 9 distinct LLMs; e.g., GPT-4o achieves only a 2.0\% success rate at the most complex instruction, revealing that the existing state-of-the-art LLMs still cannot perform well in this situation even with the help of in-context learning and finetuning. Our code and data are publicly available at https://github.com/ruleGreen/AppBench. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19743v1-abstract-full').style.display = 'none'; document.getElementById('2410.19743v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.19176">arXiv:2410.19176</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.19176">pdf</a>, <a href="https://arxiv.org/format/2410.19176">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Perturbation-based Graph Active Learning for Weakly-Supervised Belief Representation Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Sun%2C+D">Dachun Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+R">Ruijie Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jinning Li</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+R">Ruipeng Han</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+X">Xinyi Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Lyu%2C+Y">You Lyu</a>, <a href="/search/cs?searchtype=author&amp;query=Abdelzaher%2C+T">Tarek Abdelzaher</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.19176v1-abstract-short" style="display: inline;"> This paper addresses the problem of optimizing the allocation of labeling resources for semi-supervised belief representation learning in social networks. The objective is to strategically identify valuable messages on social media graphs that are worth labeling within a constrained budget, ultimately maximizing the task&#39;s performance. Despite the progress in unsupervised or semi-supervised method&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19176v1-abstract-full').style.display = 'inline'; document.getElementById('2410.19176v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.19176v1-abstract-full" style="display: none;"> This paper addresses the problem of optimizing the allocation of labeling resources for semi-supervised belief representation learning in social networks. The objective is to strategically identify valuable messages on social media graphs that are worth labeling within a constrained budget, ultimately maximizing the task&#39;s performance. Despite the progress in unsupervised or semi-supervised methods in advancing belief and ideology representation learning on social networks and the remarkable efficacy of graph learning techniques, the availability of high-quality curated labeled social data can greatly benefit and further improve performances. Consequently, allocating labeling efforts is a critical research problem in scenarios where labeling resources are limited. This paper proposes a graph data augmentation-inspired perturbation-based active learning strategy (PerbALGraph) that progressively selects messages for labeling according to an automatic estimator, obviating human guidance. This estimator is based on the principle that messages in the network that exhibit heightened sensitivity to structural features of the observational data indicate landmark quality that significantly influences semi-supervision processes. We design the estimator to be the prediction variance under a set of designed graph perturbations, which is model-agnostic and application-independent. Extensive experiment results demonstrate the effectiveness of the proposed strategy for belief representation learning tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19176v1-abstract-full').style.display = 'none'; document.getElementById('2410.19176v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Wang%2C+R&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Wang%2C+R&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Wang%2C+R&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Wang%2C+R&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Wang%2C+R&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Wang%2C+R&amp;start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">&hellip;</span></li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

Pages: 1 2 3 4 5 6 7 8 9 10