
Search | arXiv e-print repository

Showing 1–50 of 438 results for author: Shen, S

Searching in archive cs. (The same query can be run across all archives at https://arxiv.org/search/?searchtype=author&query=Shen%2C+S.) Results are sorted by announcement date (newest first), 50 per page.

</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Shen%2C+S&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Shen%2C+S&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Shen%2C+S&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Shen%2C+S&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Shen%2C+S&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Shen%2C+S&amp;start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">&hellip;</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.16392">arXiv:2411.16392</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.16392">pdf</a>, <a href="https://arxiv.org/format/2411.16392">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Quadratic Gaussian Splatting for Efficient and Detailed Surface Reconstruction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Ziyu Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+B">Binbin Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+H">Hanqing Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+L">Liyang Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Xiang%2C+X">Xiaojun Xiang</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+S">Shunhan Shen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.16392v1-abstract-short" style="display: inline;"> Recently, 3D Gaussian Splatting (3DGS) has attracted attention for its superior rendering quality and speed over Neural Radiance Fields (NeRF). To address 3DGS&#39;s limitations in surface representation, 2D Gaussian Splatting (2DGS) introduced disks as scene primitives to model and reconstruct geometries from multi-view images, offering view-consistent geometry. 
However, the disk&#39;s first-order linear&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16392v1-abstract-full').style.display = 'inline'; document.getElementById('2411.16392v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.16392v1-abstract-full" style="display: none;"> Recently, 3D Gaussian Splatting (3DGS) has attracted attention for its superior rendering quality and speed over Neural Radiance Fields (NeRF). To address 3DGS&#39;s limitations in surface representation, 2D Gaussian Splatting (2DGS) introduced disks as scene primitives to model and reconstruct geometries from multi-view images, offering view-consistent geometry. However, the disk&#39;s first-order linear approximation often leads to over-smoothed results. We propose Quadratic Gaussian Splatting (QGS), a novel method that replaces disks with quadric surfaces, enhancing geometric fitting, whose code will be open-sourced. QGS defines Gaussian distributions in non-Euclidean space, allowing primitives to capture more complex textures. As a second-order surface approximation, QGS also renders spatial curvature to guide the normal consistency term, to effectively reduce over-smoothing. Moreover, QGS is a generalized version of 2DGS that achieves more accurate and detailed reconstructions, as verified by experiments on DTU and TNT, demonstrating its effectiveness in surpassing current state-of-the-art methods in geometry reconstruction. Our code willbe released as open source. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16392v1-abstract-full').style.display = 'none'; document.getElementById('2411.16392v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
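   A note on the first-order vs. second-order contrast in this abstract: a quadratic patch can absorb surface curvature that a planar (disk-like) primitive cannot. The minimal numpy sketch below is an illustration of that point only, not the QGS implementation (unreleased at listing time); it compares the least-squares fitting error of the two patch models on a synthetic curved surface.

```python
import numpy as np

# Synthetic curved surface patch: z = cos(x) * cos(y) has genuine
# second-order curvature (z ~ 1 - x^2/2 - y^2/2 near the origin).
rng = np.random.default_rng(0)
x, y = rng.uniform(-0.5, 0.5, (2, 400))
z = np.cos(x) * np.cos(y)

# First-order (planar, disk-like) model: z ~ a*x + b*y + c
A1 = np.column_stack([x, y, np.ones_like(x)])
sse_plane = np.linalg.lstsq(A1, z, rcond=None)[1][0]

# Second-order (quadric) model: z ~ a*x^2 + b*y^2 + c*x*y + d*x + e*y + f
A2 = np.column_stack([x**2, y**2, x * y, x, y, np.ones_like(x)])
sse_quadric = np.linalg.lstsq(A2, z, rcond=None)[1][0]

print(f"planar fit SSE:  {sse_plane:.6f}")
print(f"quadric fit SSE: {sse_quadric:.6f}")  # much lower on curved geometry
```
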
2. arXiv:2411.07461 [pdf, other]
   Categories: cs.CV (Computer Vision and Pattern Recognition), cs.AI (Artificial Intelligence)
   Title: BLIP3-KALE: Knowledge Augmented Large-Scale Dense Captions
   Authors: Anas Awadalla, Le Xue, Manli Shu, An Yan, Jun Wang, Senthil Purushwalkam, Sheng Shen, Hannah Lee, Oscar Lo, Jae Sung Park, Etash Guha, Silvio Savarese, Ludwig Schmidt, Yejin Choi, Caiming Xiong, Ran Xu
   Abstract: We introduce BLIP3-KALE, a dataset of 218 million image-text pairs that bridges the gap between descriptive synthetic captions and factual web-scale alt-text. KALE augments synthetic dense image captions with web-scale alt-text to generate factually grounded image captions. Our two-stage approach leverages large vision-language models and language models to create knowledge-augmented captions, which are then used to train a specialized VLM for scaling up the dataset. We train vision-language models on KALE and demonstrate improvements on vision-language tasks. Our experiments show the utility of KALE for training more capable and knowledgeable multimodal models. We release the KALE dataset at https://huggingface.co/datasets/Salesforce/blip3-kale
   Submitted 11 November, 2024; originally announced November 2024.

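   Since the abstract gives the dataset's Hugging Face location, one minimal way to inspect it is via the `datasets` library in streaming mode. The split name is an assumption and the column names are not documented in this listing, so the sketch prints the keys rather than guessing them.

```python
from datasets import load_dataset

# Stream rather than download: the dataset holds ~218M image-text pairs.
ds = load_dataset("Salesforce/blip3-kale", split="train", streaming=True)

for i, example in enumerate(ds):
    print(example.keys())  # inspect the actual column names first
    if i >= 2:
        break
```
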
3. arXiv:2411.06642 [pdf, ps, other]
   Categories: eess.SP (Signal Processing), cs.IT (Information Theory)
   Title: Antenna Coding Empowered by Pixel Antennas
   Authors: Shanpu Shen, Kai-Kit Wong, Ross Murch
   Abstract: Pixel antennas, based on discretizing a continuous radiation surface into small elements called pixels, are a flexible reconfigurable antenna technology. By controlling the connections between pixels via switches, the characteristics of pixel antennas can be adjusted to enhance the wireless channel. Inspired by this, we propose a novel technique, denoted antenna coding, empowered by pixel antennas. We first derive a physical, electromagnetics-based communication model for pixel antennas using microwave multiport network theory and a beamspace channel representation. With this model, we optimize the antenna coding to maximize the channel gain in a single-input single-output (SISO) pixel antenna system and develop a codebook design for antenna coding to reduce the computational complexity. We analyze the average channel gain of the SISO pixel antenna system and derive the corresponding upper bound. In addition, we jointly optimize the antenna coding and the transmit signal covariance matrix to maximize the channel capacity in a multiple-input multiple-output (MIMO) pixel antenna system. Simulation results show that using pixel antennas can enhance the average channel gain by up to 5.4 times and the channel capacity by up to 3.1 times, demonstrating the significant potential of pixel antennas as a new dimension for designing and optimizing wireless communication systems.
   Submitted 10 November, 2024; originally announced November 2024.
   Comments: 13 pages, 11 figures

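   The core search problem here, picking a binary switch configuration (an "antenna code") that maximizes SISO channel gain, can be illustrated with a toy model. The channel function below is a stand-in, not the paper's multiport network model; exhaustive enumeration over all 2^N codes is exactly the cost the paper's codebook design aims to shortcut.

```python
import numpy as np

rng = np.random.default_rng(1)
n_switches = 8  # 2^8 = 256 candidate antenna codes

# Toy stand-in: each switch linearly perturbs a nominal complex channel h0.
basis = rng.standard_normal(n_switches) + 1j * rng.standard_normal(n_switches)
h0 = rng.standard_normal() + 1j * rng.standard_normal()

def channel(code: np.ndarray) -> complex:
    return h0 + basis @ code

best_gain, best_code = -np.inf, None
for bits in range(2 ** n_switches):  # exhaustive search over all codes
    code = np.array([(bits >> k) & 1 for k in range(n_switches)], dtype=float)
    gain = abs(channel(code)) ** 2   # SISO channel gain |h|^2
    if gain > best_gain:
        best_gain, best_code = gain, code

print("best antenna code:", best_code.astype(int), "gain:", round(best_gain, 3))
```
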
4. arXiv:2411.06392 [pdf, other]
   Categories: cs.DB (Databases)
   Title: LSMGraph: A High-Performance Dynamic Graph Storage System with Multi-Level CSR
   Authors: Song Yu, Shufeng Gong, Qian Tao, Sijie Shen, Yanfeng Zhang, Wenyuan Yu, Pengxi Liu, Zhixin Zhang, Hongfu Li, Xiaojian Luo, Ge Yu, Jingren Zhou
   Abstract: The growing volume of graph data may exhaust the main memory. It is crucial to design a disk-based graph storage system that can ingest updates and analyze graphs efficiently. However, existing dynamic graph storage systems suffer from read or write amplification and face the challenge of optimizing both read and write performance simultaneously. To address this challenge, we propose LSMGraph, a novel dynamic graph storage system that combines the write-friendly LSM-tree and the read-friendly CSR. It leverages the multi-level structure of LSM-trees to optimize write performance while utilizing the compact CSR structures embedded in the LSM-trees to boost read performance. LSMGraph uses a new memory structure, MemGraph, to efficiently cache graph updates and uses a multi-level index to speed up reads within the multi-level structure. Furthermore, LSMGraph incorporates a vertex-grained version control mechanism to mitigate the impact of LSM-tree compaction on read performance and to ensure the correctness of concurrent read and write operations. Our evaluation shows that LSMGraph significantly outperforms state-of-the-art (graph) storage systems on both graph update and graph analytical workloads.
   Submitted 17 November, 2024; v1 submitted 10 November, 2024; originally announced November 2024.

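   To see why the CSR layout LSMGraph embeds inside its LSM levels is read-friendly, here is a minimal CSR adjacency structure; the LSM-tree write path, MemGraph cache, and version control are not modeled.

```python
import numpy as np

# Edge list for a small 4-vertex directed graph.
edges = [(0, 1), (0, 2), (1, 2), (2, 3), (3, 0)]
num_v = 4

# Count out-degrees, then lay all neighbor lists into one contiguous array.
deg = np.zeros(num_v, dtype=np.int64)
for u, _ in edges:
    deg[u] += 1
offsets = np.concatenate([[0], np.cumsum(deg)])

neighbors = np.empty(len(edges), dtype=np.int64)
cursor = offsets[:-1].copy()
for u, v in edges:
    neighbors[cursor[u]] = v
    cursor[u] += 1

def out_neighbors(u: int) -> np.ndarray:
    # O(1) offset lookup followed by a cache-friendly contiguous scan:
    # this compactness is what makes CSR fast to read but costly to update.
    return neighbors[offsets[u]:offsets[u + 1]]

print(out_neighbors(0))  # -> [1 2]
```
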
5. arXiv:2411.01870 [pdf, other]
   Categories: cs.CV (Computer Vision and Pattern Recognition), cs.AI (Artificial Intelligence)
   Title: Mining and Transferring Feature-Geometry Coherence for Unsupervised Point Cloud Registration
   Authors: Kezheng Xiong, Haoen Xiang, Qingshan Xu, Chenglu Wen, Siqi Shen, Jonathan Li, Cheng Wang
   Abstract: Point cloud registration, a fundamental task in 3D vision, has achieved remarkable success with learning-based methods in outdoor environments. Unsupervised outdoor point cloud registration methods have recently emerged to circumvent the need for costly pose annotations. However, they fail to establish reliable optimization objectives for unsupervised training, either relying on overly strong geometric assumptions or suffering from poor-quality pseudo-labels due to inadequate integration of low-level geometric and high-level contextual information. We have observed that, in the feature space, latent new inlier correspondences tend to cluster around respective positive anchors that summarize the features of existing inliers. Motivated by this observation, we propose a novel unsupervised registration method, termed INTEGER, that incorporates high-level contextual information for reliable pseudo-label mining. Specifically, we propose the Feature-Geometry Coherence Mining module to dynamically adapt the teacher for each mini-batch of data during training and to discover reliable pseudo-labels by considering both high-level feature representations and low-level geometric cues. Furthermore, we propose Anchor-Based Contrastive Learning to facilitate contrastive learning with anchors for a robust feature space. Lastly, we introduce a Mixed-Density Student to learn density-invariant features, addressing challenges related to density variation and low overlap in outdoor scenarios. Extensive experiments on the KITTI and nuScenes datasets demonstrate that INTEGER achieves competitive performance in terms of accuracy and generalizability.
   Submitted 4 November, 2024; originally announced November 2024.
   Comments: Accepted by NeurIPS 2024

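   The abstract describes Anchor-Based Contrastive Learning only at a high level; an InfoNCE-style loss with anchors as the positives is one standard reading, sketched below, and is not claimed to be the paper's exact formulation.

```python
import torch
import torch.nn.functional as F

def anchor_contrastive_loss(feats, anchors, labels, tau=0.07):
    """InfoNCE-style loss: pull each correspondence feature toward its
    assigned positive anchor and away from the other anchors.

    feats:   (N, D) correspondence features
    anchors: (K, D) anchors summarizing features of existing inliers
    labels:  (N,)   index of the anchor each feature should match
    """
    feats = F.normalize(feats, dim=1)
    anchors = F.normalize(anchors, dim=1)
    logits = feats @ anchors.t() / tau  # (N, K) scaled cosine similarities
    return F.cross_entropy(logits, labels)

loss = anchor_contrastive_loss(torch.randn(32, 64), torch.randn(4, 64),
                               torch.randint(0, 4, (32,)))
print(loss.item())
```
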
6. arXiv:2411.01800 [pdf, other]
   Categories: cs.CV (Computer Vision and Pattern Recognition), cs.LG (Machine Learning)
   Title: Expanding Sparse Tuning for Low Memory Usage
   Authors: Shufan Shen, Junshu Sun, Xiangyang Ji, Qingming Huang, Shuhui Wang
   Abstract: Parameter-efficient fine-tuning (PEFT) is an effective method for adapting pre-trained vision models to downstream tasks by tuning a small subset of parameters. Among PEFT methods, sparse tuning achieves superior performance by adjusting only the weights most relevant to downstream tasks, rather than densely tuning the whole weight matrix. However, this performance improvement comes with increased memory usage, which stems from two factors: the storage of the whole weight matrix as learnable parameters in the optimizer, and the additional storage of tunable weight indexes. In this paper, we propose a method named SNELL (Sparse tuning with kerNELized LoRA) for sparse tuning with low memory usage. To achieve low memory usage, SNELL decomposes the tunable matrix for sparsification into two learnable low-rank matrices, avoiding the costly storage of the whole original matrix. A competition-based sparsification mechanism is further proposed to avoid storing tunable weight indexes. To maintain the effectiveness of sparse tuning with low-rank matrices, we extend the low-rank decomposition by applying nonlinear kernel functions to whole-matrix merging. Consequently, we gain an increase in the rank of the merged matrix, enhancing the ability of SNELL to adapt pre-trained models to downstream tasks. Extensive experiments on multiple downstream tasks show that SNELL achieves state-of-the-art performance with low memory usage, extending sparse-tuning PEFT to large-scale models. Code is available at https://github.com/ssfgunner/SNELL.
   Submitted 3 November, 2024; originally announced November 2024.
   Comments: Accepted by NeurIPS 2024

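   Two of SNELL's ingredients, low-rank factors in place of a dense tunable matrix and competition-based (top-k magnitude) sparsification that needs no stored indexes, can be sketched as follows. The specific nonlinear kernel used for whole-matrix merging is not given in this listing, so the merge below is the plain low-rank product; treat this as an interpretation, not the released code.

```python
import torch

d_out, d_in, rank, density = 256, 256, 8, 0.05

W = torch.randn(d_out, d_in)  # frozen pre-trained weight
# Only the low-rank factors are learnable: 2*d*r params, not d_out*d_in.
A = torch.randn(rank, d_in, requires_grad=True)
B = torch.randn(d_out, rank, requires_grad=True)

def sparse_delta(B, A, density):
    delta = B @ A  # merged update (SNELL applies a nonlinear kernel here)
    k = int(density * delta.numel())
    # Top-k entries by magnitude "win the competition"; the mask is
    # recomputed on the fly, so no tunable weight indexes are stored.
    thresh = delta.abs().flatten().kthvalue(delta.numel() - k).values
    return delta * (delta.abs() > thresh).float()

W_adapted = W + sparse_delta(B, A, density)
print(f"tunable params: {A.numel() + B.numel()} vs dense {W.numel()}")
```
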
id="2411.00473v1-abstract-short" style="display: inline;"> The development of large language models (LLM) has revolutionized various fields and is anticipated to drive the advancement of autonomous systems. In the context of autonomous optical networks, creating a high-level cognitive agent in the control layer remains a challenge. However, LLM is primarily developed for natural language processing tasks, rendering them less effective in predicting the ph&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00473v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00473v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00473v1-abstract-full" style="display: none;"> The development of large language models (LLM) has revolutionized various fields and is anticipated to drive the advancement of autonomous systems. In the context of autonomous optical networks, creating a high-level cognitive agent in the control layer remains a challenge. However, LLM is primarily developed for natural language processing tasks, rendering them less effective in predicting the physical dynamics of optical communications. Moreover, optical networks demand rigorous stability, where direct deployment of strategies generated from LLM poses safety concerns. In this paper, a digital twin (DT)-enhanced LLM scheme is proposed to facilitate autonomous optical networks. By leveraging monitoring data and advanced models, the DT of optical networks can accurately characterize their physical dynamics, furnishing LLMs with dynamic-updated information for reliable decision-making. Prior to deployment, the generated strategies from LLM can be pre-verified in the DT platform, which also provides feedback to the LLM for further refinement of strategies. The synergistic interplay between DT and LLM for autonomous optical networks is demonstrated through three scenarios: performance optimization under dynamic loadings in an experimental C+L-band long-haul transmission link, protection switching for device upgrading in a field-deployed six-node mesh network, and performance recovery after fiber cuts in a field-deployed C+L-band transmission link. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00473v1-abstract-full').style.display = 'none'; document.getElementById('2411.00473v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7 pages,6 figures; Accepted by IEEE Communications Magazine, Open call</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.15603">arXiv:2410.15603</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.15603">pdf</a>, <a href="https://arxiv.org/format/2410.15603">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Quantum Physics">quant-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> </div> </div> <p class="title is-5 mathjax"> Trace-Distance based End-to-End Entanglement Fidelity with Information Preservation in Quantum Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Kumar%2C+P">Pankaj Kumar</a>, <a href="/search/cs?searchtype=author&amp;query=Kar%2C+B">Binayak Kar</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+S">Shan-Hsiang Shen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.15603v1-abstract-short" style="display: inline;"> Quantum networks hold the potential to revolutionize a variety of fields by surpassing the capabilities of their classical counterparts. Many of these applications necessitate the sharing of high-fidelity entangled pairs among communicating parties. However, the inherent nature of entanglement leads to an exponential decrease in fidelity as the distance between quantum nodes increases. This phenom&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15603v1-abstract-full').style.display = 'inline'; document.getElementById('2410.15603v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.15603v1-abstract-full" style="display: none;"> Quantum networks hold the potential to revolutionize a variety of fields by surpassing the capabilities of their classical counterparts. Many of these applications necessitate the sharing of high-fidelity entangled pairs among communicating parties. However, the inherent nature of entanglement leads to an exponential decrease in fidelity as the distance between quantum nodes increases. This phenomenon makes it challenging to generate high-fidelity entangled pairs and preserve information in quantum networks. To tackle this problem, we utilized two strategies to ensure high-fidelity entangled pairs and information preservation within a quantum network. First, we use closeness centrality as a metric to identify the closest nodes in the network. Second, we introduced the trace-distance based path purification (TDPP) algorithm, specifically designed to enable information preservation and path purification entanglement routing. This algorithm identifies the shortest path within quantum networks using closeness centrality and integrates trace-distance computations for distinguishing quantum states and maintaining end-to-end (E2E) entanglement fidelity. 
Simulation results demonstrate that the proposed algorithm improves network throughput and E2E fidelity while preserving information compared to existing methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15603v1-abstract-full').style.display = 'none'; document.getElementById('2410.15603v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.11064">arXiv:2410.11064</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.11064">pdf</a>, <a href="https://arxiv.org/format/2410.11064">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Neurons and Cognition">q-bio.NC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Quantitative Methods">q-bio.QM</span> </div> </div> <p class="title is-5 mathjax"> Parsing altered brain connectivity in neurodevelopmental disorders by integrating graph-based normative modeling and deep generative networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Shen%2C+R+S">Rui Sherry Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Osmanl%C4%B1o%C4%9Flu%2C+Y">Yusuf Osmanl谋o臒lu</a>, <a href="/search/cs?searchtype=author&amp;query=Parker%2C+D">Drew Parker</a>, <a href="/search/cs?searchtype=author&amp;query=Aunapu%2C+D">Darien Aunapu</a>, <a href="/search/cs?searchtype=author&amp;query=Yerys%2C+B+E">Benjamin E. Yerys</a>, <a href="/search/cs?searchtype=author&amp;query=Tun%C3%A7%2C+B">Birkan Tun莽</a>, <a href="/search/cs?searchtype=author&amp;query=Verma%2C+R">Ragini Verma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.11064v2-abstract-short" style="display: inline;"> Divergent brain connectivity is thought to underlie the behavioral and cognitive symptoms observed in many neurodevelopmental disorders. Quantifying divergence from neurotypical connectivity patterns offers a promising pathway to inform diagnosis and therapeutic interventions. While advanced neuroimaging techniques, such as diffusion MRI (dMRI), have facilitated the mapping of brain&#39;s structural c&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11064v2-abstract-full').style.display = 'inline'; document.getElementById('2410.11064v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.11064v2-abstract-full" style="display: none;"> Divergent brain connectivity is thought to underlie the behavioral and cognitive symptoms observed in many neurodevelopmental disorders. Quantifying divergence from neurotypical connectivity patterns offers a promising pathway to inform diagnosis and therapeutic interventions. 
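   The trace distance itself is standard: T(rho, sigma) = (1/2)||rho - sigma||_1, which for the Hermitian difference rho - sigma is half the sum of the absolute eigenvalues. A direct numpy evaluation is below; the TDPP routing and purification logic is not reproduced here.

```python
import numpy as np

def trace_distance(rho: np.ndarray, sigma: np.ndarray) -> float:
    """T(rho, sigma) = 0.5 * sum |eigenvalues| of the Hermitian rho - sigma."""
    eigs = np.linalg.eigvalsh(rho - sigma)
    return 0.5 * np.sum(np.abs(eigs))

# Werner-like states: the |Phi+> Bell projector mixed with white noise.
bell = np.zeros((4, 4))
bell[0, 0] = bell[0, 3] = bell[3, 0] = bell[3, 3] = 0.5

def werner(p: float) -> np.ndarray:
    return p * bell + (1 - p) * np.eye(4) / 4

# Distinguishability of two entangled pairs of different quality.
print(trace_distance(werner(0.9), werner(0.7)))  # -> 0.15
```
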
9. arXiv:2410.11064 [pdf, other]
   Categories: q-bio.NC (Neurons and Cognition), cs.AI (Artificial Intelligence), q-bio.QM (Quantitative Methods)
   Title: Parsing altered brain connectivity in neurodevelopmental disorders by integrating graph-based normative modeling and deep generative networks
   Authors: Rui Sherry Shen, Yusuf Osmanlıoğlu, Drew Parker, Darien Aunapu, Benjamin E. Yerys, Birkan Tunç, Ragini Verma
   Abstract: Divergent brain connectivity is thought to underlie the behavioral and cognitive symptoms observed in many neurodevelopmental disorders. Quantifying divergence from neurotypical connectivity patterns offers a promising pathway to inform diagnosis and therapeutic interventions. While advanced neuroimaging techniques, such as diffusion MRI (dMRI), have facilitated the mapping of the brain's structural connectome, the challenge lies in accurately modeling developmental trajectories within these complex networked structures to create robust neurodivergence markers. In this work, we present the Brain Representation via Individualized Deep Generative Embedding (BRIDGE) framework, which integrates normative modeling with a bio-inspired deep generative model to create a reference trajectory of connectivity transformation as part of neurotypical development. This enables the assessment of neurodivergence by comparing individuals to the established neurotypical trajectory. BRIDGE provides a global neurodivergence score based on the difference between connectivity-based brain age and chronological age, along with region-wise neurodivergence maps that highlight localized connectivity differences. Application of BRIDGE to a large cohort of children with autism spectrum disorder demonstrates that the global neurodivergence score correlates with clinical assessments in autism, and the regional maps offer insights into individual-level heterogeneity in neurodevelopmental disorders. Together, the neurodivergence score and maps form powerful tools for quantifying developmental divergence in connectivity patterns, advancing the development of imaging markers for personalized diagnosis and intervention in various clinical contexts.
   Submitted 18 November, 2024; v1 submitted 14 October, 2024; originally announced October 2024.

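   The global score described here (connectivity-based brain age minus chronological age) reduces to a simple gap computation once a normative age predictor exists. The ridge regressor and the synthetic connectome features below are stand-ins for BRIDGE's deep generative model, purely to make the scoring step concrete.

```python
import numpy as np
from sklearn.linear_model import Ridge

rng = np.random.default_rng(0)
X = rng.standard_normal((200, 50))       # connectome features, neurotypical cohort
age = rng.uniform(6, 18, 200) + X[:, 0]  # toy ages correlated with feature 0

model = Ridge().fit(X, age)              # normative brain-age model

x_subj, age_subj = rng.standard_normal(50), 12.0
brain_age = model.predict(x_subj[None])[0]
neurodivergence_score = brain_age - age_subj  # global deviation from the norm
print(round(neurodivergence_score, 2))
```
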
10. arXiv:2410.10287 [pdf, other]
    Categories: cs.CV (Computer Vision and Pattern Recognition)
    Title: Manifold-Aware Local Feature Modeling for Semi-Supervised Medical Image Segmentation
    Authors: Sicheng Shen, Jinming Cao, Yifang Yin, Roger Zimmermann
    Abstract: Achieving precise medical image segmentation is vital for effective treatment planning and accurate disease diagnosis. Traditional fully-supervised deep learning methods, though highly precise, are heavily reliant on large volumes of labeled data, which are often difficult to obtain due to the expertise required for medical annotations. This has led to the rise of semi-supervised learning approaches that utilize both labeled and unlabeled data to mitigate the label-scarcity issue. In this paper, we introduce the Manifold-Aware Local Feature Modeling Network (MANet), which enhances the U-Net architecture by incorporating manifold supervision signals. This approach focuses on improving boundary accuracy, which is crucial for reliable medical diagnosis. To further extend the versatility of our method, we propose two variants: MA-Sobel and MA-Canny. The MA-Sobel variant employs the Sobel operator, which is effective for both 2D and 3D data, while the MA-Canny variant utilizes the Canny operator, specifically designed for 2D images, to refine boundary detection. These variants allow our method to adapt to various medical image modalities and dimensionalities, ensuring broader applicability. Our extensive experiments on datasets such as ACDC, LA, and Pancreas-NIH demonstrate that MANet consistently surpasses state-of-the-art methods in performance metrics like Dice and Jaccard scores. The proposed method also shows improved generalization across various semi-supervised segmentation networks, highlighting its robustness and effectiveness. Visual analysis of segmentation results confirms that MANet offers clearer and more accurate class boundaries, underscoring the value of manifold information in medical image segmentation.
    Submitted 14 October, 2024; originally announced October 2024.
    Comments: 11 pages

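    The boundary signal that the MA-Sobel variant relies on is the standard Sobel edge magnitude. The sketch below derives such a map from a toy label mask, the kind of target a boundary supervision term could regress; how MANet actually wires this into its loss is not specified in this listing.

```python
import numpy as np
from scipy.ndimage import sobel

mask = np.zeros((64, 64), dtype=float)
mask[20:44, 20:44] = 1.0  # toy segmentation label: one square organ

# Sobel gradients along each axis; magnitude peaks only at class boundaries.
gx, gy = sobel(mask, axis=0), sobel(mask, axis=1)
boundary = np.hypot(gx, gy)
boundary /= boundary.max() + 1e-8

rows = boundary.nonzero()[0]
print(rows.min(), rows.max())  # responses hug the square's edges (~19..44)
```
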
arXiv:2410.09374 (https://arxiv.org/abs/2410.09374) [pdf, other] cs.CV, cs.RO
ESVO2: Direct Visual-Inertial Odometry with Stereo Event Cameras
Authors: Junkai Niu, Sheng Zhong, Xiuyuan Lu, Shaojie Shen, Guillermo Gallego, Yi Zhou
Abstract: Event-based visual odometry is a specific branch of visual Simultaneous Localization and Mapping (SLAM) techniques, which aims at solving the tracking and mapping sub-problems in parallel by exploiting the special working principles of neuromorphic (i.e., event-based) cameras. Due to the motion-dependent nature of event data, explicit data association, i.e., feature matching under large-baseline view-point changes, is hard to establish, making direct methods a more rational choice. However, state-of-the-art direct methods are limited by the high computational complexity of the mapping sub-problem and the degeneracy of camera pose tracking in certain degrees of freedom (DoF) in rotation. In this paper, we resolve these issues by building an event-based stereo visual-inertial odometry system on top of our previous direct pipeline, Event-based Stereo Visual Odometry. Specifically, to speed up the mapping operation, we propose an efficient strategy for sampling contour points according to the local dynamics of events. The mapping performance is also improved in terms of structure completeness and local smoothness by merging the temporal stereo and static stereo results. To circumvent the degeneracy of camera pose tracking in recovering the pitch and yaw components of general six-DoF motion, we introduce IMU measurements as motion priors via pre-integration. To this end, a compact back-end is proposed for continuously updating the IMU bias and predicting the linear velocity, enabling accurate motion prediction for camera pose tracking. The resulting system scales well with modern high-resolution event cameras and leads to better global positioning accuracy in large-scale outdoor environments. Extensive evaluations on five publicly available datasets featuring different resolutions and scenarios justify the superior performance of the proposed system against five state-of-the-art methods.
Submitted 12 October, 2024; originally announced October 2024.
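The IMU motion prior mentioned in this abstract is conventionally obtained by pre-integrating gyroscope and accelerometer samples between frames. A minimal discrete-time sketch, with bias Jacobians and noise propagation omitted; this is the textbook recursion, not the paper's exact back-end:

```python
import numpy as np

def skew(w):
    return np.array([[0, -w[2], w[1]], [w[2], 0, -w[0]], [-w[1], w[0], 0]])

def so3_exp(phi):
    """Rodrigues' formula: rotation vector -> rotation matrix."""
    theta = np.linalg.norm(phi)
    if theta < 1e-9:
        return np.eye(3) + skew(phi)
    a = phi / theta
    return (np.cos(theta) * np.eye(3) + (1 - np.cos(theta)) * np.outer(a, a)
            + np.sin(theta) * skew(a))

def preintegrate(gyro, accel, dt, bg, ba):
    """Accumulate relative rotation/velocity/position deltas over IMU samples
    taken at period dt, given gyro bias bg and accelerometer bias ba."""
    dR, dv, dp = np.eye(3), np.zeros(3), np.zeros(3)
    for w, a in zip(gyro, accel):
        a_c = a - ba
        dp += dv * dt + 0.5 * (dR @ a_c) * dt ** 2
        dv += (dR @ a_c) * dt
        dR = dR @ so3_exp((w - bg) * dt)
    return dR, dv, dp  # gravity is re-added when forming the residual
```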
arXiv:2410.06195 (https://arxiv.org/abs/2410.06195) [pdf, other] cs.CL, cs.AI
Entering Real Social World! Benchmarking the Theory of Mind and Socialization Capabilities of LLMs from a First-person Perspective
Authors: Guiyang Hou, Wenqi Zhang, Yongliang Shen, Zeqi Tan, Sihao Shen, Weiming Lu
Abstract: In the social world, humans possess the capability to infer and reason about others' mental states (such as emotions, beliefs, and intentions), known as the Theory of Mind (ToM). Simultaneously, humans' own mental states evolve in response to social situations, a capability we refer to as socialization. Together, these capabilities form the foundation of human social interaction. In the era of artificial intelligence (AI), especially with the development of large language models (LLMs), we raise an intriguing question: how do LLMs perform in terms of ToM and socialization capabilities? And, more broadly, can these AI models truly enter and navigate the real social world? Existing research evaluates LLMs' ToM and socialization capabilities by positioning LLMs as passive observers from a third-person perspective, rather than as active participants. However, compared to the third-person perspective, observing and understanding the world from an egocentric first-person perspective is a natural approach for both humans and AI agents. The ToM and socialization capabilities of LLMs from a first-person perspective, a crucial attribute for advancing embodied AI agents, remain unexplored. To answer these questions and bridge the research gap, we introduce EgoSocialArena, a novel framework designed to evaluate and investigate the ToM and socialization capabilities of LLMs from a first-person perspective. It encompasses two evaluation environments, static and interactive, with seven scenarios: Daily Life, Counterfactual, New World, Blackjack, Number Guessing, and Limit Texas Hold'em, totaling 2,195 data entries. With EgoSocialArena, we have conducted a comprehensive evaluation of nine advanced LLMs and observed key insights regarding the future development of LLMs as well as the capability levels of the most advanced LLMs currently available.
Submitted 8 October, 2024; originally announced October 2024.
Comments: 15 pages, 5 figures
arXiv:2410.01966 (https://arxiv.org/abs/2410.01966) [pdf, other] cs.CV, cs.AI
Enhancing Screen Time Identification in Children with a Multi-View Vision Language Model and Screen Time Tracker
Authors: Xinlong Hou, Sen Shen, Xueshen Li, Xinran Gao, Ziyi Huang, Steven J. Holiday, Matthew R. Cribbet, Susan W. White, Edward Sazonov, Yu Gan
Abstract: Being able to accurately monitor the screen exposure of young children is important for research on phenomena linked to screen use, such as childhood obesity, physical activity, and social interaction. Most existing studies rely upon self-report or manual measures from bulky wearable sensors, thus lacking efficiency and accuracy in capturing quantitative screen exposure data. In this work, we developed a novel sensor-informatics framework that utilizes egocentric images from a wearable sensor, termed the screen time tracker (STT), and a vision language model (VLM). In particular, we devised a multi-view VLM that takes multiple views from egocentric image sequences and interprets screen exposure dynamically. We validated our approach on a dataset of children's free-living activities, demonstrating significant improvement over existing methods based on plain vision language models and object detection models. The results support the promise of this monitoring approach, which could optimize behavioral research on screen exposure in children's naturalistic settings.
Submitted 21 November, 2024; v1 submitted 2 October, 2024; originally announced October 2024.
Comments: Prepare for submission

arXiv:2409.15715 (https://arxiv.org/abs/2409.15715) [pdf, other] cs.CV, cs.GR
Disentangled Generation and Aggregation for Robust Radiance Fields
Authors: Shihe Shen, Huachen Gao, Wangze Xu, Rui Peng, Luyang Tang, Kaiqiang Xiong, Jianbo Jiao, Ronggang Wang
Abstract: Triplane-based radiance fields have gained attention in recent years due to their ability to effectively disentangle 3D scenes with a high-quality representation at low computation cost. A key requirement of this method is precise input camera poses. However, due to the local update property of the triplane, joint estimation in the style of previous joint pose-NeRF optimization easily falls into local minima. To this end, we propose the Disentangled Triplane Generation module to introduce global feature context and smoothness into triplane learning, which mitigates errors caused by local updating. We then propose Disentangled Plane Aggregation to mitigate the entanglement caused by the common triplane feature aggregation during camera pose updating. In addition, we introduce a two-stage warm-start training strategy to reduce the implicit constraints caused by the triplane generator. Quantitative and qualitative results demonstrate that our proposed method achieves state-of-the-art performance in novel view synthesis with noisy or unknown camera poses, as well as efficient convergence of optimization. Project page: https://gaohchen.github.io/DiGARR/
Submitted 24 September, 2024; originally announced September 2024.
Comments: 27 pages, 11 figures, Accepted by ECCV'2024
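For readers unfamiliar with the triplane representation this entry builds on: a 3D point is projected onto three axis-aligned feature planes, each plane is sampled bilinearly, and the three features are aggregated. A generic sketch, assuming summation as the aggregation (concatenation is also common); this is not the paper's module:

```python
import torch
import torch.nn.functional as F

def sample_triplane(planes, xyz):
    """planes: (3, C, R, R) feature planes for XY, XZ, YZ; xyz: (N, 3) in [-1, 1].
    Returns (N, C) aggregated features."""
    coords = torch.stack([xyz[:, [0, 1]], xyz[:, [0, 2]], xyz[:, [1, 2]]])  # (3, N, 2)
    grid = coords.view(3, -1, 1, 2)                   # grid_sample wants (B, H, W, 2)
    feats = F.grid_sample(planes, grid, align_corners=True)  # (3, C, N, 1)
    return feats.squeeze(-1).sum(dim=0).t()           # sum the three planes -> (N, C)

planes = torch.randn(3, 32, 128, 128)   # toy feature planes
pts = torch.rand(1000, 3) * 2 - 1
print(sample_triplane(planes, pts).shape)  # torch.Size([1000, 32])
```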
arXiv:2409.14316 (https://arxiv.org/abs/2409.14316) [pdf, other] cs.CV
MVPGS: Excavating Multi-view Priors for Gaussian Splatting from Sparse Input Views
Authors: Wangze Xu, Huachen Gao, Shihe Shen, Rui Peng, Jianbo Jiao, Ronggang Wang
Abstract: Recently, the advancement of the Neural Radiance Field (NeRF) has facilitated few-shot Novel View Synthesis (NVS), a significant challenge in 3D vision applications. Despite numerous attempts to reduce the dense input requirement of NeRF, it still suffers from time-consuming training and rendering. More recently, 3D Gaussian Splatting (3DGS) has achieved real-time, high-quality rendering with an explicit point-based representation. However, similar to NeRF, it tends to overfit the training views due to a lack of constraints. In this paper, we propose MVPGS, a few-shot NVS method that excavates multi-view priors based on 3D Gaussian Splatting. We leverage recent learning-based Multi-view Stereo (MVS) to enhance the quality of geometric initialization for 3DGS. To mitigate overfitting, we propose a forward-warping method that provides additional appearance constraints conforming to the scene, based on the computed geometry. Furthermore, we introduce a view-consistent geometry constraint on Gaussian parameters to facilitate proper optimization convergence, and utilize monocular depth regularization as compensation. Experiments show that the proposed method achieves state-of-the-art performance with real-time rendering speed. Project page: https://zezeaaa.github.io/projects/MVPGS/
Submitted 22 September, 2024; originally announced September 2024.
Comments: Accepted by ECCV 2024, Project page: https://zezeaaa.github.io/projects/MVPGS/
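The forward-warping appearance constraint can be pictured with pinhole geometry: source pixels are lifted by their depths and re-projected into a new view. A generic sketch under assumed intrinsics K and relative pose (R, t), not MVPGS's exact formulation:

```python
import numpy as np

def forward_warp_points(depth, K, R, t):
    """Lift every pixel of an (H, W) depth map to 3D and project into a target
    view; returns target-view pixel coordinates, shape (H*W, 2).
    Assumes positive depths and column-vector convention p_tgt ~ K (R X + t)."""
    H, W = depth.shape
    u, v = np.meshgrid(np.arange(W), np.arange(H))
    pix = np.stack([u, v, np.ones_like(u)], axis=-1).reshape(-1, 3).astype(np.float64)
    rays = (np.linalg.inv(K) @ pix.T) * depth.reshape(1, -1)  # 3D points, source frame
    p_tgt = K @ (R @ rays + t[:, None])                       # project into target view
    return (p_tgt[:2] / p_tgt[2:]).T
```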
arXiv:2409.13993 (https://arxiv.org/abs/2409.13993) [pdf, other] cs.RO, cs.GT
Integrated Decision Making and Trajectory Planning for Autonomous Driving Under Multimodal Uncertainties: A Bayesian Game Approach
Authors: Zhenmin Huang, Tong Li, Shaojie Shen, Jun Ma
Abstract: Modeling the interaction between traffic agents is a key issue in designing safe and non-conservative maneuvers in autonomous driving. The problem is challenging when multimodality and behavioral uncertainty are involved. Existing methods either fail to plan interactively or consider unimodal behaviors that can lead to catastrophic results. In this paper, we introduce an integrated decision-making and trajectory-planning framework based on a Bayesian game (i.e., a game of incomplete information). Human decisions inherently exhibit discrete characteristics and are therefore modeled as types of players in the game. A general solver based on no-regret learning is introduced to obtain the corresponding Bayesian Coarse Correlated Equilibrium, which captures the interaction between traffic agents in the multimodal context. With the attained equilibrium, decision-making and trajectory planning are performed simultaneously, and the resulting interactive strategy is shown to be optimal over the expectation of rivals' driving intentions. Closed-loop simulations on different traffic scenarios are performed to illustrate the generalizability and effectiveness of the proposed framework.
Submitted 20 September, 2024; originally announced September 2024.
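No-regret learning in self-play drives the empirical distribution of joint actions toward a coarse correlated equilibrium; regret matching is the textbook instance. A toy two-player sketch on payoff matrices, far simpler than the Bayesian game above, just to show the mechanism:

```python
import numpy as np

def regret_matching(payoff_a, payoff_b, iters=5000, seed=0):
    """Self-play regret matching on a bimatrix game; the empirical joint
    distribution of play approaches a coarse correlated equilibrium."""
    rng = np.random.default_rng(seed)
    nA, nB = payoff_a.shape
    reg_a, reg_b = np.zeros(nA), np.zeros(nB)
    joint = np.zeros((nA, nB))
    for _ in range(iters):
        pa = np.maximum(reg_a, 0)
        pa = pa / pa.sum() if pa.sum() > 0 else np.full(nA, 1 / nA)
        pb = np.maximum(reg_b, 0)
        pb = pb / pb.sum() if pb.sum() > 0 else np.full(nB, 1 / nB)
        i, j = rng.choice(nA, p=pa), rng.choice(nB, p=pb)
        joint[i, j] += 1
        reg_a += payoff_a[:, j] - payoff_a[i, j]  # counterfactual regret vs. play
        reg_b += payoff_b[i, :] - payoff_b[i, j]
    return joint / iters

# Toy "merge vs. yield" game (chicken-style payoffs; symmetric, so B = A^T):
A = np.array([[-5.0, 2.0], [0.0, 1.0]])
print(regret_matching(A, A.T))
```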
arXiv:2409.13449 (https://arxiv.org/abs/2409.13449) [pdf, other] cs.CL
Minstrel: Structural Prompt Generation with Multi-Agents Coordination for Non-AI Experts
Authors: Ming Wang, Yuanzhong Liu, Xiaoyu Liang, Yijie Huang, Daling Wang, Xiaocui Yang, Sijia Shen, Shi Feng, Xiaoming Zhang, Chaofeng Guan, Yifei Zhang
Abstract: LLMs have demonstrated commendable performance across diverse domains. Nevertheless, formulating high-quality prompts to assist them in their work poses a challenge for non-AI experts. Existing research in prompt engineering suggests somewhat scattered optimization principles and designs prompt optimizers that depend on empirical experience. Unfortunately, these endeavors lack a structural design, which incurs high learning costs and is not conducive to the iterative updating of prompts, especially for non-AI experts. Inspired by structured, reusable programming languages, we propose LangGPT, a structural prompt design framework. Furthermore, we introduce Minstrel, a multi-generative-agent system with reflection that automates the generation of structural prompts. Experiments and a case study illustrate that structural prompts generated by Minstrel or written manually significantly enhance the performance of LLMs. We also analyze the ease of use of structural prompts through a user survey in our online community.
Submitted 20 September, 2024; originally announced September 2024.
Comments: arXiv admin note: text overlap with arXiv:2402.16929
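Structural prompts in the LangGPT style organize a prompt into named, reusable sections. The template below is a hypothetical illustration of that idea; the section names follow the commonly published LangGPT layout, and this is not a verbatim Minstrel output:

```python
# A hypothetical LangGPT-style structural prompt (content is illustrative).
STRUCTURAL_PROMPT = """
# Role: Technical Reviewer

## Profile
- language: English
- description: Reviews pull requests for correctness and style.

## Skills
- Reads diffs and spots logic errors.
- Suggests minimal, concrete fixes.

## Rules
1. Never approve code with failing tests.
2. Cite the file and line for every comment.

## Workflow
1. Summarize the change.
2. List issues ordered by severity.

## Initialization
As <Role>, follow <Rules> and greet the user.
"""
```

The point of the structure is that sections can be edited or swapped independently, which is what makes iterative updating tractable for non-experts.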
arXiv:2409.12412 (https://arxiv.org/abs/2409.12412) [pdf] cs.LG, cs.CV
How to predict on-road air pollution based on street view images and machine learning: a quantitative analysis of the optimal strategy
Authors: Hui Zhong, Di Chen, Pengqin Wang, Wenrui Wang, Shaojie Shen, Yonghong Liu, Meixin Zhu
Abstract: On-road air pollution exhibits substantial variability over short distances due to emission sources, dilution, and physicochemical processes. Integrating mobile monitoring data with street view images (SVIs) holds promise for predicting local air pollution. However, algorithms, sampling strategies, and image quality introduce extra errors due to a lack of reliable references that quantify their effects. To bridge this gap, we employed 314 taxis to dynamically monitor NO, NO2, PM2.5, and PM10 and sampled corresponding SVIs, aiming to develop a reliable strategy. We extracted SVI features from ~382,000 streetscape images collected at various angles (0°, 90°, 180°, 270°) and ranges (buffers with radii of 100 m, 200 m, 300 m, 400 m, 500 m). Three machine learning algorithms, alongside the linear land-use regression (LUR) model, were experimented with to explore the influence of different algorithms. Four typical image quality issues were identified and discussed. Generally, machine learning methods outperform linear LUR for estimating the four pollutants, with the ranking: random forest > XGBoost > neural network > LUR. Compared to single-angle sampling, the averaging strategy is an effective way to avoid the bias of insufficient feature capture. The optimal sampling strategy is therefore to obtain SVIs within a 100 m radius buffer and extract features using the averaging strategy. This approach achieved estimates for each aggregation location with absolute errors almost always below 2.5 μg/m^3 or ppb. Overexposure, blur, and underexposure led to image misjudgments and incorrect identifications, causing an overestimation of road features and an underestimation of human-activity features, and thereby inaccurate NO, NO2, PM2.5, and PM10 estimates.
Submitted 18 September, 2024; originally announced September 2024.
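As a concrete picture of the winning configuration reported above (features averaged over the four angles within a 100 m buffer, fed to a random forest), here is a generic scikit-learn sketch; the file name, column names, and target are placeholders, not the paper's data schema:

```python
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

# Placeholder input: one row per (location, view angle); svi_feat_* columns hold
# features extracted from images at the 100 m buffer.
df = pd.read_csv("svi_features_100m.csv")  # hypothetical file
feat_cols = [c for c in df.columns if c.startswith("svi_feat_")]

X = df.groupby("location_id")[feat_cols].mean()   # averaging strategy over angles
y = df.groupby("location_id")["no2_ppb"].mean()   # hypothetical pollutant target

model = RandomForestRegressor(n_estimators=500, random_state=0)
scores = cross_val_score(model, X, y, cv=5, scoring="neg_mean_absolute_error")
print("cross-validated MAE:", -scores.mean())
```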
arXiv:2409.11981 (https://arxiv.org/abs/2409.11981) [pdf, other] cs.RO
LMMCoDrive: Cooperative Driving with Large Multimodal Model
Authors: Haichao Liu, Ruoyu Yao, Zhenmin Huang, Shaojie Shen, Jun Ma
Abstract: To address the intricate challenges of decentralized cooperative scheduling and motion planning in Autonomous Mobility-on-Demand (AMoD) systems, this paper introduces LMMCoDrive, a novel cooperative driving framework that leverages a Large Multimodal Model (LMM) to enhance traffic efficiency in dynamic urban environments. The framework seamlessly integrates scheduling and motion planning to ensure the effective operation of Cooperative Autonomous Vehicles (CAVs). The spatial relationship between CAVs and passenger requests is abstracted into a Bird's-Eye View (BEV) to fully exploit the potential of the LMM. Trajectories are carefully refined for each CAV while collision avoidance is ensured through safety constraints. A decentralized optimization strategy, facilitated by the Alternating Direction Method of Multipliers (ADMM) within the LMM framework, is proposed to drive the graph evolution of CAVs. Simulation results demonstrate the pivotal role and significant impact of the LMM in optimizing CAV scheduling and enhancing the decentralized cooperative optimization process for each vehicle. This marks a substantial stride towards practical, efficient, and safe AMoD systems poised to revolutionize urban transportation. The code is available at https://github.com/henryhcliu/LMMCoDrive.
Submitted 18 September, 2024; originally announced September 2024.
Comments: 7 pages, 5 figures
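The decentralized step above rests on consensus ADMM. A toy sketch in which each agent solves a local quadratic subproblem in closed form and all copies are driven to agree; the paper's actual subproblems encode trajectories and collision constraints:

```python
import numpy as np

def consensus_admm(targets, rho=1.0, iters=100):
    """Minimize sum_i ||x_i - targets[i]||^2 subject to x_i = z (consensus).
    Each x_i-update is a purely local solve; z gathers the agents' copies."""
    n, d = targets.shape
    x, u, z = np.zeros((n, d)), np.zeros((n, d)), np.zeros(d)
    for _ in range(iters):
        x = (targets + 0.5 * rho * (z - u)) / (1 + 0.5 * rho)  # local closed-form solves
        z = (x + u).mean(axis=0)                               # consensus (gather) step
        u = u + x - z                                          # scaled dual update
    return z  # converges to the mean of the targets for this toy objective

print(consensus_admm(np.array([[0., 0.], [2., 2.], [4., 1.]])))
```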
arXiv:2409.08681 (https://arxiv.org/abs/2409.08681) [pdf, other] cs.RO
SLIM: Scalable and Lightweight LiDAR Mapping in Urban Environments
Authors: Zehuan Yu, Zhijian Qiao, Wenyi Liu, Huan Yin, Shaojie Shen
Abstract: LiDAR point cloud maps are extensively utilized on roads for robot navigation due to their high consistency. However, dense point clouds face challenges of high memory consumption and reduced maintainability for long-term operations. In this study, we introduce SLIM, a scalable and lightweight mapping system for long-term LiDAR mapping in urban environments. The system begins by parameterizing structural point clouds into lines and planes. These lightweight, structural representations meet the requirements of map merging, pose graph optimization, and bundle adjustment, ensuring incremental management and local consistency. For long-term operation, a map-centric nonlinear factor recovery method is designed to sparsify poses while preserving mapping accuracy. We validate the SLIM system with multi-session real-world LiDAR data from classical LiDAR mapping datasets, including KITTI, NCLT, and HeLiPR. The experiments demonstrate its mapping accuracy, compactness, and scalability. Map re-use is also verified through map-based robot localization. Ultimately, with multi-session LiDAR data, the SLIM system provides a globally consistent map with low memory consumption (130 KB/km). We have made our code open source to benefit the community.
Submitted 13 September, 2024; originally announced September 2024.
Comments: 20 pages, 16 figures
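Parameterizing point cloud patches into planes, as SLIM does, typically reduces to least-squares plane fitting, for which the SVD route is standard. A minimal sketch, not SLIM's actual implementation:

```python
import numpy as np

def fit_plane(points):
    """Least-squares plane through an (N, 3) patch: returns (unit normal, d)
    with n . p + d = 0; the normal is the direction of smallest spread."""
    centroid = points.mean(axis=0)
    _, _, vt = np.linalg.svd(points - centroid)
    normal = vt[-1]
    return normal, -normal @ centroid

# Near-planar synthetic patch around z = 5:
pts = np.random.randn(100, 3) * [1, 1, 0.01] + [0, 0, 5.0]
n, d = fit_plane(pts)
print(n, d)  # n close to [0, 0, 1] (up to sign), d close to -5 (or +5)
```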
arXiv:2409.07763 (https://arxiv.org/abs/2409.07763) [pdf, other] cs.LG, cs.AI, cs.CV
Reimagining Linear Probing: Kolmogorov-Arnold Networks in Transfer Learning
Authors: Sheng Shen, Rabih Younes
Abstract: This paper introduces Kolmogorov-Arnold Networks (KAN) as an enhancement to the traditional linear probing method in transfer learning. Linear probing, often applied to the final layer of pre-trained models, is limited by its inability to model complex relationships in data. To address this, we propose substituting the linear probing layer with KAN, which leverages spline-based representations to approximate intricate functions. In this study, we integrate KAN with a ResNet-50 model pre-trained on ImageNet and evaluate its performance on the CIFAR-10 dataset. We perform a systematic hyperparameter search, focusing on grid size and spline degree (k), to optimize KAN's flexibility and accuracy. Our results demonstrate that KAN consistently outperforms traditional linear probing, achieving significant improvements in accuracy and generalization across a range of configurations. These findings indicate that KAN offers a more powerful and adaptable alternative to conventional linear probing techniques in transfer learning.
Submitted 12 September, 2024; originally announced September 2024.
Comments: 10 pages, 5 figures
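To make the substitution concrete: a KAN-style layer replaces each scalar weight with a learnable univariate function. The sketch below uses piecewise-linear (hat-basis) functions as a simple stand-in for the B-spline parameterization the paper tunes, and the probing shapes (2048-d frozen ResNet-50 features in, 10 CIFAR-10 classes out) are the assumed configuration:

```python
import torch
import torch.nn as nn

class PiecewiseLinearKANLayer(nn.Module):
    """y_j = sum_i f_ij(x_i), each f_ij a learnable piecewise-linear function on a
    fixed grid over [-bound, bound] (a hat-basis stand-in for B-splines)."""
    def __init__(self, in_dim, out_dim, grid_size=16, bound=3.0):
        super().__init__()
        self.register_buffer("grid", torch.linspace(-bound, bound, grid_size))
        self.values = nn.Parameter(torch.zeros(in_dim, out_dim, grid_size))
        nn.init.normal_(self.values, std=0.1)

    def forward(self, x):                                   # x: (B, in_dim)
        g = self.grid
        x = x.clamp(float(g[0]), float(g[-1]) - 1e-6)
        step = g[1] - g[0]
        idx = ((x - g[0]) / step).floor().long()            # cell index, (B, in)
        frac = (x - g[0]) / step - idx.float()              # position inside cell
        v = self.values                                     # (in, out, G)
        ar = torch.arange(v.size(0), device=x.device)
        left = v[ar, :, idx]                                # (B, in, out)
        right = v[ar, :, idx + 1]
        f = left + (right - left) * frac.unsqueeze(-1)      # linear interpolation
        return f.sum(dim=1)                                 # sum over inputs, (B, out)

# Probing head on frozen features (stand-in for ResNet-50 pooled outputs):
head = PiecewiseLinearKANLayer(2048, 10)
logits = head(torch.randn(32, 2048))
print(logits.shape)  # torch.Size([32, 10])
```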
arXiv:2409.06124 (https://arxiv.org/abs/2409.06124) [pdf, other] cs.HC
Human Impedance Modulation to Improve Visuo-Haptic Perception
Authors: Xiaoxiao Cheng, Shixian Shen, Ekaterina Ivanova, Gerolamo Carboni, Atsushi Takagi, Etienne Burdet
Abstract: Humans activate muscles to shape the mechanical interaction with their environment, but can they harness this control mechanism to best sense the environment? We investigated how participants adapt their muscle activation to visual and haptic information when tracking a randomly moving target with a robotic interface. The results exhibit a differentiated effect of these sensory modalities: participants' muscle cocontraction increases with haptic noise and decreases with visual noise, in apparent contradiction to previous results. These results can be explained, and reconciled with previous findings, by considering spring-like muscle mechanics, where stiffness increases with cocontraction to regulate motion guidance. Increasing cocontraction to follow the motion plan more closely favors accurate visual over haptic information, while decreasing it avoids injecting visual noise and relies on accurate haptic information. We formulated this active sensing mechanism as the optimization of visuo-haptic information and effort. This OIE model can explain the adaptation of muscle activity to unimodal and multimodal sensory information when interacting with fixed or dynamic environments, or with another human, and can be used to optimize human-robot interaction.
Submitted 9 September, 2024; originally announced September 2024.

arXiv:2409.04398 (https://arxiv.org/abs/2409.04398) [pdf, other] cs.CV, cs.AI, cs.GR, cs.MM
DOI: 10.1109/TPAMI.2024.3457229 (https://doi.org/10.1109/TPAMI.2024.3457229)
HiSC4D: Human-centered interaction and 4D Scene Capture in Large-scale Space Using Wearable IMUs and LiDAR
Authors: Yudi Dai, Zhiyong Wang, Xiping Lin, Chenglu Wen, Lan Xu, Siqi Shen, Yuexin Ma, Cheng Wang
Abstract: We introduce HiSC4D, a novel Human-centered interaction and 4D Scene Capture method, aimed at accurately and efficiently creating a dynamic digital world containing large-scale indoor-outdoor scenes, diverse human motions, rich human-human interactions, and human-environment interactions. By utilizing body-mounted IMUs and a head-mounted LiDAR, HiSC4D can capture egocentric human motions in unconstrained space without the need for external devices and pre-built maps. This affords great flexibility and accessibility for human-centered interaction and 4D scene capturing in various environments. Taking into account that IMUs can capture human poses without spatial restriction but are prone to drift over long periods of use, while LiDAR is stable for global localization but coarse for local positions and orientations, HiSC4D employs a joint optimization method, harmonizing all sensors and utilizing environment cues, yielding promising results for long-term capture in large scenes. To promote research on egocentric human interaction in large scenes and facilitate downstream tasks, we also present a dataset containing 8 sequences in 4 large scenes (200 to 5,000 m^2), providing 36k frames of accurate 4D human motions with SMPL annotations and dynamic scenes, 31k frames of cropped human point clouds, and scene meshes of the environments. A variety of scenarios, such as a basketball gym and a commercial street, alongside challenging human motions, such as daily greeting, one-on-one basketball playing, and tour guiding, demonstrate the effectiveness and generalization ability of HiSC4D. The dataset and code will be published at www.lidarhumanmotion.net/hisc4d for research purposes.
Submitted 14 September, 2024; v1 submitted 6 September, 2024; originally announced September 2024.
Comments: 17 pages, 10 figures, journal
Extensive experiments on multiple benchmarks containing complex large-scale scenes show that our reconstructions exhibit high-quality details and achieve new state-of-the-art performance, i.e., 46% improvements with 80% less memory consumption. Code is available at https://github.com/prstrive/SuRF. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.03634v1-abstract-full').style.display = 'none'; document.getElementById('2409.03634v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ECCV 2024 Accepted</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.02738">arXiv:2409.02738</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.02738">pdf</a>, <a href="https://arxiv.org/format/2409.02738">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> SOAR: Simultaneous Exploration and Photographing with Heterogeneous UAVs for Fast Autonomous Reconstruction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+M">Mingjie Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Feng%2C+C">Chen Feng</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Zengzhi Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+G">Guiyong Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+Y">Yiming Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zhu Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+J">Jinni Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+S">Shaojie Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+B">Boyu Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.02738v1-abstract-short" style="display: inline;"> Unmanned Aerial Vehicles (UAVs) have gained significant popularity in scene reconstruction. This paper presents SOAR, a LiDAR-Visual heterogeneous multi-UAV system specifically designed for fast autonomous reconstruction of complex environments. Our system comprises a LiDAR-equipped explorer with a large field-of-view (FoV), alongside photographers equipped with cameras. To ensure rapid acquisitio&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.02738v1-abstract-full').style.display = 'inline'; document.getElementById('2409.02738v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.02738v1-abstract-full" style="display: none;"> Unmanned Aerial Vehicles (UAVs) have gained significant popularity in scene reconstruction. This paper presents SOAR, a LiDAR-Visual heterogeneous multi-UAV system specifically designed for fast autonomous reconstruction of complex environments. 
Our system comprises a LiDAR-equipped explorer with a large field-of-view (FoV), alongside photographers equipped with cameras. To ensure rapid acquisition of the scene&#39;s surface geometry, we employ a surface frontier-based exploration strategy for the explorer. As the surface is progressively explored, we identify the uncovered areas and generate viewpoints incrementally. These viewpoints are then assigned to photographers by solving a Consistent Multiple Depot Multiple Traveling Salesman Problem (Consistent-MDMTSP), which optimizes scanning efficiency while ensuring task consistency. Finally, photographers utilize the assigned viewpoints to determine optimal coverage paths for acquiring images. We present extensive benchmarks in a realistic simulator, validating the performance of SOAR compared with classical and state-of-the-art methods. For more details, please see our project page at https://sysu-star.github.io/SOAR. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.02738v1-abstract-full').style.display = 'none'; document.getElementById('2409.02738v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to IROS2024. Code: https://github.com/SYSU-STAR/SOAR. Project page: http://sysu-star.com/SOAR/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.02119">arXiv:2409.02119</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.02119">pdf</a>, <a href="https://arxiv.org/format/2409.02119">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> CoRA: Optimizing Low-Rank Adaptation with Common Subspace of Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+X">Xiaojun Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+S">Sen Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Bao%2C+Q">Qiming Bao</a>, <a href="/search/cs?searchtype=author&amp;query=Rong%2C+H">Hongfei Rong</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+K">Kairui Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zhongsheng Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+J">Jiamou Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.02119v1-abstract-short" style="display: inline;"> In fine-tuning large language models (LLMs), conserving computational resources while maintaining effectiveness and improving outcomes within the same computational constraints is crucial.
The Low-Rank Adaptation (LoRA) strategy balances efficiency and performance in fine-tuning large models by reducing the number of trainable parameters and computational costs. However, current advancements in Lo&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.02119v1-abstract-full').style.display = 'inline'; document.getElementById('2409.02119v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.02119v1-abstract-full" style="display: none;"> In fine-tuning large language models (LLMs), conserving computational resources while maintaining effectiveness and improving outcomes within the same computational constraints is crucial. The Low-Rank Adaptation (LoRA) strategy balances efficiency and performance in fine-tuning large models by reducing the number of trainable parameters and computational costs. However, current advancements in LoRA focus mainly on its fine-tuning methodologies, with comparatively little exploration of further compressing LoRA itself. Since many of LoRA&#39;s parameters may still be superfluous, computational resources can be wasted unnecessarily. In this paper, we propose CoRA: leveraging shared knowledge to optimize LoRA training by substituting its matrix $B$ with a common subspace from large models. Our two-fold method includes (1) freezing the substitute matrix $B$ to halve the trainable parameters while training matrix $A$ for specific tasks, and (2) using the substitute matrix $B$ as an enhanced initial state for the original matrix $B$, achieving improved results with the same parameters. Our experiments show that the first approach matches the efficacy of original LoRA fine-tuning while training only half as many parameters. At the same time, the second approach yields some improvement over LoRA&#39;s original fine-tuning performance. Together, these results attest to the effectiveness of our work. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.02119v1-abstract-full').style.display = 'none'; document.getElementById('2409.02119v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024.
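<p class="is-size-7">The frozen-$B$ design above lends itself to a compact illustration. Below is a minimal PyTorch sketch, assuming a shared subspace basis has already been extracted from the large model; the names <code>CoRALinear</code> and <code>shared_B</code> are illustrative, not the authors&#39; implementation.</p> <pre><code># Minimal sketch of a LoRA-style layer whose low-rank matrix B is taken
# from a shared subspace and frozen, so only A trains (illustrative only;
# `CoRALinear` and `shared_B` are assumed names, not the paper's code).
import torch
import torch.nn as nn

class CoRALinear(nn.Module):
    def __init__(self, base, shared_B, rank, alpha=1.0):
        super().__init__()
        self.base = base
        for p in self.base.parameters():          # frozen pre-trained weight
            p.requires_grad = False
        # B comes from a common subspace shared across tasks and stays frozen,
        # halving the trainable low-rank parameters relative to plain LoRA.
        self.B = nn.Parameter(shared_B.clone(), requires_grad=False)  # (out, r)
        self.A = nn.Parameter(torch.empty(rank, base.in_features))    # (r, in)
        nn.init.normal_(self.A, std=0.02)
        self.scaling = alpha / rank

    def forward(self, x):
        return self.base(x) + (x @ self.A.T @ self.B.T) * self.scaling

base = nn.Linear(768, 768)
shared_B = torch.linalg.qr(torch.randn(768, 8)).Q   # stand-in for the extracted subspace
layer = CoRALinear(base, shared_B, rank=8)
print(sum(p.numel() for p in layer.parameters() if p.requires_grad))  # only A trains
</code></pre>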
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.14909">arXiv:2408.14909</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.14909">pdf</a>, <a href="https://arxiv.org/format/2408.14909">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> </div> </div> <p class="title is-5 mathjax"> SpikingSSMs: Learning Long Sequences with Sparse and Parallel Spiking State Space Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Shen%2C+S">Shuaijie Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+C">Chao Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+R">Renzhuo Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhong%2C+Y">Yan Zhong</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+Q">Qinghai Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+Z">Zhichao Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+J">Jianguo Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Leng%2C+L">Luziwei Leng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.14909v1-abstract-short" style="display: inline;"> Known as low energy consumption networks, spiking neural networks (SNNs) have gained a lot of attention within the past decades. While SNNs are increasing competitive with artificial neural networks (ANNs) for vision tasks, they are rarely used for long sequence tasks, despite their intrinsic temporal dynamics. In this work, we develop spiking state space models (SpikingSSMs) for long sequence lea&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.14909v1-abstract-full').style.display = 'inline'; document.getElementById('2408.14909v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.14909v1-abstract-full" style="display: none;"> Known as low energy consumption networks, spiking neural networks (SNNs) have gained a lot of attention within the past decades. While SNNs are increasing competitive with artificial neural networks (ANNs) for vision tasks, they are rarely used for long sequence tasks, despite their intrinsic temporal dynamics. In this work, we develop spiking state space models (SpikingSSMs) for long sequence learning by leveraging on the sequence learning abilities of state space models (SSMs). Inspired by dendritic neuron structure, we hierarchically integrate neuronal dynamics with the original SSM block, meanwhile realizing sparse synaptic computation. Furthermore, to solve the conflict of event-driven neuronal dynamics with parallel computing, we propose a light-weight surrogate dynamic network which accurately predicts the after-reset membrane potential and compatible to learnable thresholds, enabling orders of acceleration in training speed compared with conventional iterative methods. 
On the Long Range Arena benchmark, SpikingSSM achieves performance competitive with state-of-the-art SSMs while realizing on average 90% network sparsity. On language modeling, our network significantly surpasses existing spiking large language models (spikingLLMs) on the WikiText-103 dataset with only a third of the model size, demonstrating its potential as a backbone architecture for low-computation-cost LLMs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.14909v1-abstract-full').style.display = 'none'; document.getElementById('2408.14909v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.13742">arXiv:2408.13742</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.13742">pdf</a>, <a href="https://arxiv.org/format/2408.13742">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Multi-modal Integrated Prediction and Decision-making with Adaptive Interaction Modality Explorations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+T">Tong Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+L">Lu Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+S">Sikang Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+S">Shaojie Shen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.13742v2-abstract-short" style="display: inline;"> Navigating dense and dynamic environments poses a significant challenge for autonomous driving systems, owing to the intricate nature of multimodal interaction, wherein the actions of various traffic participants and the autonomous vehicle are complex and implicitly coupled. In this paper, we propose a novel framework, Multi-modal Integrated predictioN and Decision-making (MIND), which addresses t&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.13742v2-abstract-full').style.display = 'inline'; document.getElementById('2408.13742v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.13742v2-abstract-full" style="display: none;"> Navigating dense and dynamic environments poses a significant challenge for autonomous driving systems, owing to the intricate nature of multimodal interaction, wherein the actions of various traffic participants and the autonomous vehicle are complex and implicitly coupled. In this paper, we propose a novel framework, Multi-modal Integrated predictioN and Decision-making (MIND), which addresses the challenges by efficiently generating joint predictions and decisions covering multiple distinctive interaction modalities.
Specifically, MIND leverages learning-based scenario predictions to obtain integrated predictions and decisions with a socially consistent interaction modality and utilizes a modality-aware dynamic branching mechanism to generate scenario trees that efficiently capture the evolutions of distinctive interaction modalities with low variation of interaction uncertainty along the planning horizon. The scenario trees are seamlessly utilized by contingency planning under interaction uncertainty to obtain clear and considerate maneuvers that account for multi-modal evolutions. Comprehensive experimental results in closed-loop simulation based on a real-world driving dataset showcase performance superior to other strong baselines under various driving contexts. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.13742v2-abstract-full').style.display = 'none'; document.getElementById('2408.13742v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 25 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 9 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.11001">arXiv:2408.11001</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.11001">pdf</a>, <a href="https://arxiv.org/format/2408.11001">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MegaFusion: Extend Diffusion Models towards Higher-resolution Image Generation without Further Tuning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wu%2C+H">Haoning Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+S">Shaocheng Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+Q">Qiang Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+X">Xiaoyun Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Ya Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yanfeng Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.11001v3-abstract-short" style="display: inline;"> Diffusion models have emerged as frontrunners in text-to-image generation, but their fixed image resolution during training often leads to challenges in high-resolution image generation, such as semantic deviations and object replication.
This paper introduces MegaFusion, a novel approach that extends existing diffusion-based text-to-image models towards efficient higher-resolution generation with&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.11001v3-abstract-full').style.display = 'inline'; document.getElementById('2408.11001v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.11001v3-abstract-full" style="display: none;"> Diffusion models have emerged as frontrunners in text-to-image generation, but their fixed image resolution during training often leads to challenges in high-resolution image generation, such as semantic deviations and object replication. This paper introduces MegaFusion, a novel approach that extends existing diffusion-based text-to-image models towards efficient higher-resolution generation without additional fine-tuning or adaptation. Specifically, we employ an innovative truncate and relay strategy to bridge the denoising processes across different resolutions, allowing for high-resolution image generation in a coarse-to-fine manner. Moreover, by integrating dilated convolutions and noise re-scheduling, we further adapt the model&#39;s priors for higher resolution. The versatility and efficacy of MegaFusion make it universally applicable to both latent-space and pixel-space diffusion models, along with other derivative models. Extensive experiments confirm that MegaFusion significantly boosts the capability of existing models to produce images of megapixels and various aspect ratios, while only requiring about 40% of the original computational cost. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.11001v3-abstract-full').style.display = 'none'; document.getElementById('2408.11001v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by WACV 2025. 
Project Page: https://haoningwu3639.github.io/MegaFusion/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.05459">arXiv:2408.05459</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.05459">pdf</a>, <a href="https://arxiv.org/format/2408.05459">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1007/s00778-024-00875-8">10.1007/s00778-024-00875-8 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> A Versatile Framework for Attributed Network Clustering via K-Nearest Neighbor Augmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yiran Li</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+G">Gongyao Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Shi%2C+J">Jieming Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+R">Renchi Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+S">Shiqi Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Q">Qing Li</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+J">Jun Luo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.05459v2-abstract-short" style="display: inline;"> Attributed networks containing entity-specific information in node attributes are ubiquitous in modeling social networks, e-commerce, bioinformatics, etc. Their inherent network topology ranges from simple graphs to hypergraphs with high-order interactions and multiplex graphs with separate layers. An important graph mining task is node clustering, aiming to partition the nodes of an attributed ne&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.05459v2-abstract-full').style.display = 'inline'; document.getElementById('2408.05459v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.05459v2-abstract-full" style="display: none;"> Attributed networks containing entity-specific information in node attributes are ubiquitous in modeling social networks, e-commerce, bioinformatics, etc. Their inherent network topology ranges from simple graphs to hypergraphs with high-order interactions and multiplex graphs with separate layers. An important graph mining task is node clustering, aiming to partition the nodes of an attributed network into k disjoint clusters such that intra-cluster nodes are closely connected and share similar attributes, while inter-cluster nodes are far apart and dissimilar. It is highly challenging to capture multi-hop connections via nodes or attributes for effective clustering on multiple types of attributed networks. In this paper, we first present AHCKA as an efficient approach to attributed hypergraph clustering (AHC). 
AHCKA includes a carefully-crafted K-nearest neighbor augmentation strategy for the optimized exploitation of attribute information on hypergraphs, a joint hypergraph random walk model to devise an effective AHC objective, and an efficient solver with speedup techniques for the objective optimization. The proposed techniques are extensible to various types of attributed networks, and thus, we develop ANCKA as a versatile attributed network clustering framework, capable of attributed graph clustering (AGC), attributed multiplex graph clustering (AMGC), and AHC. Moreover, we devise ANCKA with algorithmic designs tailored for GPU acceleration to boost efficiency. We have conducted extensive experiments to compare our methods with 19 competitors on 8 attributed hypergraphs, 16 competitors on 6 attributed graphs, and 16 competitors on 3 attributed multiplex graphs, all demonstrating the superb clustering quality and efficiency of our methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.05459v2-abstract-full').style.display = 'none'; document.getElementById('2408.05459v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">25 pages, 15 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> The VLDB Journal (2024) 1-31 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.05109">arXiv:2408.05109</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.05109">pdf</a>, <a href="https://arxiv.org/format/2408.05109">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Databases">cs.DB</span> </div> </div> <p class="title is-5 mathjax"> A Survey of NL2SQL with Large Language Models: Where are we, and where are we going? 
</p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+X">Xinyu Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+S">Shuyu Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+B">Boyan Li</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+P">Peixian Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+R">Runzhi Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yuxin Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Fan%2C+J">Ju Fan</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+G">Guoliang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+N">Nan Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+Y">Yuyu Luo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.05109v2-abstract-short" style="display: inline;"> Translating users&#39; natural language queries (NL) into SQL queries (i.e., NL2SQL) can significantly reduce barriers to accessing relational databases and support various commercial applications. The performance of NL2SQL has been greatly enhanced with the emergence of Large Language Models (LLMs). In this survey, we provide a comprehensive review of NL2SQL techniques powered by LLMs, covering its e&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.05109v2-abstract-full').style.display = 'inline'; document.getElementById('2408.05109v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.05109v2-abstract-full" style="display: none;"> Translating users&#39; natural language queries (NL) into SQL queries (i.e., NL2SQL) can significantly reduce barriers to accessing relational databases and support various commercial applications. The performance of NL2SQL has been greatly enhanced with the emergence of Large Language Models (LLMs). In this survey, we provide a comprehensive review of NL2SQL techniques powered by LLMs, covering its entire lifecycle from the following four aspects: (1) Model: NL2SQL translation techniques that tackle not only NL ambiguity and under-specification, but also properly map NL with database schema and instances; (2) Data: From the collection of training data, data synthesis due to training data scarcity, to NL2SQL benchmarks; (3) Evaluation: Evaluating NL2SQL methods from multiple angles using different metrics and granularities; and (4) Error Analysis: analyzing NL2SQL errors to find the root cause and guiding NL2SQL models to evolve. Moreover, we provide a rule of thumb for developing NL2SQL solutions. Finally, we discuss the research challenges and open problems of NL2SQL in the LLMs era. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.05109v2-abstract-full').style.display = 'none'; document.getElementById('2408.05109v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.21783">arXiv:2407.21783</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.21783">pdf</a>, <a href="https://arxiv.org/format/2407.21783">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> The Llama 3 Herd of Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Grattafiori%2C+A">Aaron Grattafiori</a>, <a href="/search/cs?searchtype=author&amp;query=Dubey%2C+A">Abhimanyu Dubey</a>, <a href="/search/cs?searchtype=author&amp;query=Jauhri%2C+A">Abhinav Jauhri</a>, <a href="/search/cs?searchtype=author&amp;query=Pandey%2C+A">Abhinav Pandey</a>, <a href="/search/cs?searchtype=author&amp;query=Kadian%2C+A">Abhishek Kadian</a>, <a href="/search/cs?searchtype=author&amp;query=Al-Dahle%2C+A">Ahmad Al-Dahle</a>, <a href="/search/cs?searchtype=author&amp;query=Letman%2C+A">Aiesha Letman</a>, <a href="/search/cs?searchtype=author&amp;query=Mathur%2C+A">Akhil Mathur</a>, <a href="/search/cs?searchtype=author&amp;query=Schelten%2C+A">Alan Schelten</a>, <a href="/search/cs?searchtype=author&amp;query=Vaughan%2C+A">Alex Vaughan</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+A">Amy Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Fan%2C+A">Angela Fan</a>, <a href="/search/cs?searchtype=author&amp;query=Goyal%2C+A">Anirudh Goyal</a>, <a href="/search/cs?searchtype=author&amp;query=Hartshorn%2C+A">Anthony Hartshorn</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+A">Aobo Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Mitra%2C+A">Archi Mitra</a>, <a href="/search/cs?searchtype=author&amp;query=Sravankumar%2C+A">Archie Sravankumar</a>, <a href="/search/cs?searchtype=author&amp;query=Korenev%2C+A">Artem Korenev</a>, <a href="/search/cs?searchtype=author&amp;query=Hinsvark%2C+A">Arthur Hinsvark</a>, <a href="/search/cs?searchtype=author&amp;query=Rao%2C+A">Arun Rao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+A">Aston Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Rodriguez%2C+A">Aurelien Rodriguez</a>, <a href="/search/cs?searchtype=author&amp;query=Gregerson%2C+A">Austen Gregerson</a>, <a href="/search/cs?searchtype=author&amp;query=Spataru%2C+A">Ava Spataru</a>, <a href="/search/cs?searchtype=author&amp;query=Roziere%2C+B">Baptiste Roziere</a> , et al. (536 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.21783v3-abstract-short" style="display: inline;"> Modern artificial intelligence (AI) systems are powered by foundation models. This paper presents a new set of foundation models, called Llama 3. It is a herd of language models that natively support multilinguality, coding, reasoning, and tool usage. Our largest model is a dense Transformer with 405B parameters and a context window of up to 128K tokens. 
This paper presents an extensive empirical&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.21783v3-abstract-full').style.display = 'inline'; document.getElementById('2407.21783v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.21783v3-abstract-full" style="display: none;"> Modern artificial intelligence (AI) systems are powered by foundation models. This paper presents a new set of foundation models, called Llama 3. It is a herd of language models that natively support multilinguality, coding, reasoning, and tool usage. Our largest model is a dense Transformer with 405B parameters and a context window of up to 128K tokens. This paper presents an extensive empirical evaluation of Llama 3. We find that Llama 3 delivers comparable quality to leading language models such as GPT-4 on a plethora of tasks. We publicly release Llama 3, including pre-trained and post-trained versions of the 405B parameter language model and our Llama Guard 3 model for input and output safety. The paper also presents the results of experiments in which we integrate image, video, and speech capabilities into Llama 3 via a compositional approach. We observe this approach performs competitively with the state-of-the-art on image, video, and speech recognition tasks. The resulting models are not yet being broadly released as they are still under development. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.21783v3-abstract-full').style.display = 'none'; document.getElementById('2407.21783v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 31 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.20119">arXiv:2407.20119</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.20119">pdf</a>, <a href="https://arxiv.org/ps/2407.20119">ps</a>, <a href="https://arxiv.org/format/2407.20119">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Adaptive Self-supervised Robust Clustering for Unstructured Data with Unknown Cluster Number </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ding%2C+C">Chen-Lu Ding</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+J">Jiancan Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+W">Wei Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+S">Shiyang Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xiang Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+Y">Yancheng Yuan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.20119v2-abstract-short" style="display: inline;"> We introduce a novel self-supervised deep clustering approach tailored for unstructured data without requiring prior knowledge of the number of clusters, termed Adaptive Self-supervised Robust Clustering (ASRC). In particular, ASRC adaptively learns the graph structure and edge weights to capture both local and global structural information. The obtained graph enables us to learn clustering-friend&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.20119v2-abstract-full').style.display = 'inline'; document.getElementById('2407.20119v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.20119v2-abstract-full" style="display: none;"> We introduce a novel self-supervised deep clustering approach tailored for unstructured data without requiring prior knowledge of the number of clusters, termed Adaptive Self-supervised Robust Clustering (ASRC). In particular, ASRC adaptively learns the graph structure and edge weights to capture both local and global structural information. The obtained graph enables us to learn clustering-friendly feature representations by an enhanced graph auto-encoder with contrastive learning technique. It further leverages the clustering results adaptively obtained by robust continuous clustering (RCC) to generate prototypes for negative sampling, which can further contribute to promoting consistency among positive pairs and enlarging the gap between positive and negative samples. ASRC obtains the final clustering results by applying RCC to the learned feature representations with their consistent graph structure and edge weights. Extensive experiments conducted on seven benchmark datasets demonstrate the efficacy of ASRC, demonstrating its superior performance over other popular clustering models. Notably, ASRC even outperforms methods that rely on prior knowledge of the number of clusters, highlighting its effectiveness in addressing the challenges of clustering unstructured data. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.20119v2-abstract-full').style.display = 'none'; document.getElementById('2407.20119v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.10439">arXiv:2407.10439</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.10439">pdf</a>, <a href="https://arxiv.org/format/2407.10439">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> PolyRoom: Room-aware Transformer for Floorplan Reconstruction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yuzhou Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+L">Lingjie Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+X">Xiaodong Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+H">Hanqiao Ye</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+X">Xiang Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+X">Xianwei Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+S">Shuhan Shen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.10439v1-abstract-short" style="display: inline;"> Reconstructing geometry and topology structures from raw unstructured data has always been an important research topic in indoor mapping research. In this paper, we aim to reconstruct the floorplan with a vectorized representation from point clouds. Despite significant advancements achieved in recent years, current methods still encounter several challenges, such as missing corners or edges, inacc&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.10439v1-abstract-full').style.display = 'inline'; document.getElementById('2407.10439v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.10439v1-abstract-full" style="display: none;"> Reconstructing geometry and topology structures from raw unstructured data has always been an important research topic in indoor mapping research. In this paper, we aim to reconstruct the floorplan with a vectorized representation from point clouds. Despite significant advancements achieved in recent years, current methods still encounter several challenges, such as missing corners or edges, inaccuracies in corner positions or angles, self-intersecting or overlapping polygons, and potentially implausible topology. To tackle these challenges, we present PolyRoom, a room-aware Transformer that leverages uniform sampling representation, room-aware query initialization, and room-aware self-attention for floorplan reconstruction. 
Specifically, we adopt a uniform sampling floorplan representation to enable dense supervision during training and effective utilization of angle information. Additionally, we propose a room-aware query initialization scheme to prevent non-polygonal sequences and introduce room-aware self-attention to enhance memory efficiency and model performance. Experimental results on two widely used datasets demonstrate that PolyRoom surpasses current state-of-the-art methods both quantitatively and qualitatively. Our code is available at: https://github.com/3dv-casia/PolyRoom/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.10439v1-abstract-full').style.display = 'none'; document.getElementById('2407.10439v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ECCV2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.10101">arXiv:2407.10101</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.10101">pdf</a>, <a href="https://arxiv.org/format/2407.10101">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> WING: Wheel-Inertial Neural Odometry with Ground Manifold Constraints </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+C">Chenxing Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+K">Kunyi Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+S">Sheng Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+S">Shaojie Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+C">Chao Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+F">Fei Gao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.10101v2-abstract-short" style="display: inline;"> In this paper, we propose an interoceptive-only odometry system for ground robots with neural network processing and soft constraints based on the assumption of a globally continuous ground manifold. Exteroceptive sensors such as cameras, GPS and LiDAR may encounter difficulties in scenarios with poor illumination, indoor environments, dusty areas and straight tunnels. Therefore, improving the pos&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.10101v2-abstract-full').style.display = 'inline'; document.getElementById('2407.10101v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.10101v2-abstract-full" style="display: none;"> In this paper, we propose an interoceptive-only odometry system for ground robots with neural network processing and soft constraints based on the assumption of a globally continuous ground manifold. 
Exteroceptive sensors such as cameras, GPS and LiDAR may encounter difficulties in scenarios with poor illumination, indoor environments, dusty areas and straight tunnels. Therefore, improving the pos&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.10101v2-abstract-full').style.display = 'inline'; document.getElementById('2407.10101v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.10101v2-abstract-full" style="display: none;"> In this paper, we propose an interoceptive-only odometry system for ground robots with neural network processing and soft constraints based on the assumption of a globally continuous ground manifold. Exteroceptive sensors such as cameras, GPS and LiDAR may encounter difficulties in scenarios with poor illumination, indoor environments, dusty areas and straight tunnels. Therefore, improving pose estimation accuracy using only interoceptive sensors is important for enhancing the reliability of the navigation system even in the degraded scenarios mentioned above. However, interoceptive sensors like IMU and wheel encoders suffer from large drift due to noisy measurements. To overcome these challenges, the proposed system trains deep neural networks to correct the measurements from the IMU and wheel encoders, while considering their uncertainty. Moreover, because ground robots can only travel on the ground, we model the ground surface as a globally continuous manifold using a dual cubic B-spline, further improving estimation accuracy through this soft constraint. A novel space-based sliding-window filtering framework is proposed to fully exploit the $C^2$ continuity of ground manifold soft constraints and fuse all the information from raw measurements and neural networks in a yaw-independent attitude convention. Extensive experiments demonstrate that our proposed approach can outperform state-of-the-art learning-based interoceptive-only odometry methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.10101v2-abstract-full').style.display = 'none'; document.getElementById('2407.10101v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.07324">arXiv:2407.07324</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.07324">pdf</a>, <a href="https://arxiv.org/format/2407.07324">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Event-Aided Time-to-Collision Estimation for Autonomous Driving </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jinghang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Liao%2C+B">Bangyan Liao</a>, <a href="/search/cs?searchtype=author&amp;query=LU%2C+X">Xiuyuan LU</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+P">Peidong Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+S">Shaojie Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+Y">Yi Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.07324v2-abstract-short" style="display: inline;"> Predicting a potential collision with leading vehicles is an essential functionality of any autonomous/assisted driving system. One bottleneck of existing vision-based solutions is that their updating rate is limited to the frame rate of standard cameras used.
In this paper, we present a novel method that estimates the time to collision using a neuromorphic event-based camera, a biologically inspi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.07324v2-abstract-full').style.display = 'inline'; document.getElementById('2407.07324v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.07324v2-abstract-full" style="display: none;"> Predicting a potential collision with leading vehicles is an essential functionality of any autonomous/assisted driving system. One bottleneck of existing vision-based solutions is that their updating rate is limited to the frame rate of standard cameras used. In this paper, we present a novel method that estimates the time to collision using a neuromorphic event-based camera, a biologically inspired visual sensor that can sense at exactly the same rate as scene dynamics. The core of the proposed algorithm consists of a two-step approach for efficient and accurate geometric model fitting on event data in a coarse-to-fine manner. The first step is a robust linear solver based on a novel geometric measurement that overcomes the partial observability of event-based normal flow. The second step further refines the resulting model via a spatio-temporal registration process formulated as a nonlinear optimization problem. Experiments on both synthetic and real data demonstrate the effectiveness of the proposed method, outperforming other alternative methods in terms of efficiency and accuracy. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.07324v2-abstract-full').style.display = 'none'; document.getElementById('2407.07324v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. 
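<p class="is-size-7">The classical geometry behind such estimators is worth recalling: under pure camera approach, image motion is radial about the focus of expansion (FOE) with magnitude proportional to the flow divergence, and TTC is the inverse of that divergence. Below is a toy linear least-squares fit of this model on synthetic flow; it is a textbook baseline for intuition, not the paper&#39;s event-based solver.</p> <pre><code># Toy TTC-from-divergence illustration: under pure approach the flow is
# u = D * (x - foe) with TTC = 1 / D. A linear least-squares fit recovers
# D and the FOE. This is a classical baseline, not the paper's method.
import numpy as np

rng = np.random.default_rng(0)
foe = np.array([0.1, -0.05])   # ground-truth focus of expansion (image coords)
ttc_true = 2.0                 # seconds to collision
D = 1.0 / ttc_true             # divergence of the radial flow field

pts = rng.uniform(-1, 1, size=(200, 2))
flow = D * (pts - foe) + rng.normal(scale=1e-3, size=(200, 2))   # noisy flow

# Unknowns (D, D*foe_x, D*foe_y); each point yields two linear equations:
# u = D*x - D*foe_x and v = D*y - D*foe_y.
A = np.zeros((400, 3))
A[0::2, 0], A[0::2, 1] = pts[:, 0], -1.0
A[1::2, 0], A[1::2, 2] = pts[:, 1], -1.0
b = flow.reshape(-1)
sol, *_ = np.linalg.lstsq(A, b, rcond=None)
print("estimated TTC:", 1.0 / sol[0])   # close to 2.0 s
</code></pre>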
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to European Conference on Computer Vision 2024, dataset used in this paper can be found at https://nail-hnu.github.io/EventAidedTTC</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.01864">arXiv:2407.01864</a> <span>&nbsp;&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Research on target detection method of distracted driving behavior based on improved YOLOv8 </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Shen%2C+S">Shiquan Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Z">Zhizhong Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+P">Pan Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.01864v2-abstract-short" style="display: inline;"> With the development of deep learning technology, the detection and classification of distracted driving behaviour requires higher accuracy. Existing deep learning-based methods are computationally intensive and parameter redundant, limiting the efficiency and accuracy in practical applications. To solve this problem, this study proposes an improved YOLOv8 detection method based on the original YO&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.01864v2-abstract-full').style.display = 'inline'; document.getElementById('2407.01864v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.01864v2-abstract-full" style="display: none;"> With the development of deep learning technology, the detection and classification of distracted driving behaviour requires higher accuracy. Existing deep learning-based methods are computationally intensive and parameter redundant, limiting the efficiency and accuracy in practical applications. To solve this problem, this study proposes an improved YOLOv8 detection method based on the original YOLOv8 model by integrating the BoTNet module, GAM attention mechanism and EIoU loss function. By optimising the feature extraction and multi-scale feature fusion strategies, the training and inference processes are simplified, and the detection accuracy and efficiency are significantly improved. Experimental results show that the improved model performs well in both detection speed and accuracy, with an accuracy rate of 99.4%, and the model is smaller and easy to deploy, which is able to identify and classify distracted driving behaviours in real time, provide timely warnings, and enhance driving safety. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.01864v2-abstract-full').style.display = 'none'; document.getElementById('2407.01864v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Major revision on content, no replacement available soon</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.00578">arXiv:2407.00578</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.00578">pdf</a>, <a href="https://arxiv.org/format/2407.00578">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> UniQuad: A Unified and Versatile Quadrotor Platform Series for UAV Research and Application </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yichen Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xinyi Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+P">Peize Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Junzhe Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zou%2C+H">Hetai Zou</a>, <a href="/search/cs?searchtype=author&amp;query=Pan%2C+N">Neng Pan</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+F">Fei Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+S">Shaojie Shen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.00578v2-abstract-short" style="display: inline;"> As quadrotors take on an increasingly diverse range of roles, researchers often need to develop new hardware platforms tailored for specific tasks, introducing significant engineering overhead. In this article, we introduce the UniQuad series, a unified and versatile quadrotor platform series that offers high flexibility to adapt to a wide range of common tasks, excellent customizability for advan&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.00578v2-abstract-full').style.display = 'inline'; document.getElementById('2407.00578v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.00578v2-abstract-full" style="display: none;"> As quadrotors take on an increasingly diverse range of roles, researchers often need to develop new hardware platforms tailored for specific tasks, introducing significant engineering overhead. In this article, we introduce the UniQuad series, a unified and versatile quadrotor platform series that offers high flexibility to adapt to a wide range of common tasks, excellent customizability for advanced demands, and easy maintenance in case of crashes. This project is fully open-source at https://hkust-aerial-robotics.github.io/UniQuad. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.00578v2-abstract-full').style.display = 'none'; document.getElementById('2407.00578v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to 40th Anniversary of the IEEE Conference on Robotics and Automation (ICRA-X40)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.00577">arXiv:2407.00577</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.00577">pdf</a>, <a href="https://arxiv.org/format/2407.00577">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> FALCON: Fast Autonomous Aerial Exploration using Coverage Path Guidance </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yichen Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xinyi Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Feng%2C+C">Chen Feng</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+B">Boyu Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+S">Shaojie Shen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.00577v1-abstract-short" style="display: inline;"> This paper introduces FALCON, a novel Fast Autonomous expLoration framework using COverage path guidaNce, which aims at setting a new performance benchmark in the field of autonomous aerial exploration. Despite recent advancements in the domain, existing exploration planners often suffer from inefficiencies such as frequent revisitations of previously explored regions. FALCON effectively harnesses&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.00577v1-abstract-full').style.display = 'inline'; document.getElementById('2407.00577v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.00577v1-abstract-full" style="display: none;"> This paper introduces FALCON, a novel Fast Autonomous expLoration framework using COverage path guidaNce, which aims at setting a new performance benchmark in the field of autonomous aerial exploration. Despite recent advancements in the domain, existing exploration planners often suffer from inefficiencies such as frequent revisitations of previously explored regions. FALCON effectively harnesses the full potential of online generated coverage paths in enhancing exploration efficiency. The framework begins with an incremental connectivity-aware space decomposition and connectivity graph construction, which facilitate efficient coverage path planning. 
Subsequently, a hierarchical planner generates a coverage path spanning the entire unexplored space, serving as global guidance. Then, a local planner optimizes the frontier visitation order, minimizing traversal time while consciously incorporating the intention of the global guidance. Finally, minimum-time smooth and safe trajectories are produced to visit the frontier viewpoints. For fair and comprehensive benchmark experiments, we introduce a lightweight exploration planner evaluation environment that allows for comparing exploration planners across a variety of testing scenarios using an identical quadrotor simulator. Additionally, a VECO criterion is proposed for an in-depth analysis of FALCON's significant performance in comparison with state-of-the-art exploration planners. Extensive ablation studies demonstrate the effectiveness of each component in the proposed framework. Real-world experiments conducted fully onboard further validate FALCON's practical capability in complex and challenging environments. The source code of both the exploration planner FALCON and the exploration planner evaluation environment will be released to benefit the community.
Submitted 29 June, 2024; originally announced July 2024.
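The local planner described above orders frontier viewpoints under global coverage-path guidance. A minimal sketch of that general idea, assuming Euclidean travel cost and a per-frontier rank along the global path; the function and weight are hypothetical, not the FALCON algorithm:

```python
# Greedy frontier ordering guided by a global coverage path (illustrative
# sketch only). `coverage_order[i]` is frontier i's rank along the global path;
# travel time is crudely approximated by Euclidean distance.
import math

def order_frontiers(start, frontiers, coverage_order, w_guidance=0.5):
    remaining = list(range(len(frontiers)))
    pos, tour, next_rank = start, [], 0
    while remaining:
        def cost(i):
            travel = math.dist(pos, frontiers[i])
            deviation = abs(coverage_order[i] - next_rank)  # global-guidance penalty
            return travel + w_guidance * deviation
        best = min(remaining, key=cost)
        remaining.remove(best)
        tour.append(best)
        pos = frontiers[best]
        next_rank = coverage_order[best] + 1
    return tour

print(order_frontiers((0.0, 0.0),
                      [(1.0, 0.0), (5.0, 5.0), (2.0, 1.0)],
                      coverage_order=[0, 2, 1]))
```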
arXiv:2406.16978 [pdf, other] cs.LG cs.AI cs.RO
MetaFollower: Adaptable Personalized Autonomous Car Following
Authors: Xianda Chen, Kehua Chen, Meixin Zhu, Hao Yang, Shaojie Shen, Xuesong Wang, Yinhai Wang
Abstract: Car-following (CF) modeling, a fundamental component in microscopic traffic simulation, has attracted increasing interest from researchers in the past decades. In this study, we propose an adaptable personalized car-following framework, MetaFollower, by leveraging the power of meta-learning. Specifically, we first utilize Model-Agnostic Meta-Learning (MAML) to extract common driving knowledge from various CF events. Afterward, the pre-trained model can be fine-tuned on new drivers with only a few CF trajectories to achieve personalized CF adaptation. We additionally combine Long Short-Term Memory (LSTM) and the Intelligent Driver Model (IDM) to reflect temporal heterogeneity with high interpretability. Unlike conventional adaptive cruise control (ACC) systems that rely on predefined settings and constant parameters without considering heterogeneous driving characteristics, MetaFollower can accurately capture and simulate the intricate dynamics of car-following behavior while considering the unique driving styles of individual drivers. We demonstrate the versatility and adaptability of MetaFollower by showcasing its ability to quickly adapt to new drivers with limited training data. To evaluate the performance of MetaFollower, we conduct rigorous experiments comparing it with both data-driven and physics-based models. The results reveal that our proposed framework outperforms baseline models in predicting car-following behavior with higher accuracy and safety. To the best of our knowledge, this is the first car-following model aiming to achieve fast adaptation by considering both driver and temporal heterogeneity based on meta-learning.
Submitted 23 June, 2024; originally announced June 2024.
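The MAML step above (meta-train on many drivers' CF events, then adapt with a few trajectories) follows the standard inner/outer-loop recipe. A minimal sketch, assuming a toy regression model and synthetic per-driver tasks; this is not the MetaFollower code:

```python
# Minimal MAML sketch for personalizing a car-following model. Each "task" is
# one driver's CF events; model, features, and hyperparameters are assumptions.
import torch

model = torch.nn.Sequential(torch.nn.Linear(3, 32), torch.nn.Tanh(),
                            torch.nn.Linear(32, 1))  # (gap, dv, v) -> accel
meta_opt = torch.optim.Adam(model.parameters(), lr=1e-3)
inner_lr, loss_fn = 1e-2, torch.nn.MSELoss()

def sample_driver_task():
    x = torch.randn(16, 3)                 # stand-in for CF features
    y = x.sum(dim=1, keepdim=True)         # stand-in for observed acceleration
    return (x[:8], y[:8]), (x[8:], y[8:])  # support / query split

for step in range(100):
    (xs, ys), (xq, yq) = sample_driver_task()
    # Inner loop: one adaptation step on this driver's support set.
    fast = {n: p for n, p in model.named_parameters()}
    grads = torch.autograd.grad(loss_fn(model(xs), ys), list(fast.values()),
                                create_graph=True)
    fast = {n: p - inner_lr * g for (n, p), g in zip(fast.items(), grads)}
    # Outer loop: evaluate adapted weights on the query set, update meta-params.
    out = torch.func.functional_call(model, fast, (xq,))
    meta_loss = loss_fn(out, yq)
    meta_opt.zero_grad(); meta_loss.backward(); meta_opt.step()
```

At deployment, the same inner-loop update run on a new driver's few trajectories yields the personalized model.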
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.14288">arXiv:2406.14288</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.14288">pdf</a>, <a href="https://arxiv.org/format/2406.14288">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Revisiting Modularity Maximization for Graph Clustering: A Contrastive Learning Perspective </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yunfei Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jintang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yuehe Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+R">Ruofan Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+E">Ericbk Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+J">Jing Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Tian%2C+S">Sheng Tian</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+S">Shuheng Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Fu%2C+X">Xing Fu</a>, <a href="/search/cs?searchtype=author&amp;query=Meng%2C+C">Changhua Meng</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+W">Weiqiang Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+L">Liang Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.14288v1-abstract-short" style="display: inline;"> Graph clustering, a fundamental and challenging task in graph mining, aims to classify nodes in a graph into several disjoint clusters. In recent years, graph contrastive learning (GCL) has emerged as a dominant line of research in graph clustering and advances the new state-of-the-art. However, GCL-based methods heavily rely on graph augmentations and contrastive schemes, which may potentially in&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.14288v1-abstract-full').style.display = 'inline'; document.getElementById('2406.14288v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.14288v1-abstract-full" style="display: none;"> Graph clustering, a fundamental and challenging task in graph mining, aims to classify nodes in a graph into several disjoint clusters. In recent years, graph contrastive learning (GCL) has emerged as a dominant line of research in graph clustering and advances the new state-of-the-art. However, GCL-based methods heavily rely on graph augmentations and contrastive schemes, which may potentially introduce challenges such as semantic drift and scalability issues. Another promising line of research involves the adoption of modularity maximization, a popular and effective measure for community detection, as the guiding principle for clustering tasks. Despite the recent progress, the underlying mechanism of modularity maximization is still not well understood. In this work, we dig into the hidden success of modularity maximization for graph clustering. 
Our analysis reveals strong connections between modularity maximization and graph contrastive learning, where positive and negative examples are naturally defined by modularity. In light of our results, we propose a community-aware graph clustering framework, coined MAGI, which leverages modularity maximization as a contrastive pretext task to effectively uncover the underlying information of communities in graphs, while avoiding the problem of semantic drift. Extensive experiments on multiple graph datasets verify the effectiveness of MAGI in terms of scalability and clustering performance compared to state-of-the-art graph clustering methods. Notably, MAGI easily scales to a graph with 100M nodes while outperforming strong baselines.
Submitted 20 June, 2024; originally announced June 2024.
Comments: KDD 2024 research track. Code available at https://github.com/EdisonLeeeee/MAGI
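To make the modularity-as-contrastive-objective idea concrete: the modularity matrix B = A - dd^T/(2m) weights observed edges as positives and the degree-product null model as negatives. A minimal sketch under those standard definitions (dense toy graph, random embeddings; not the MAGI code):

```python
# Maximizing modularity-weighted embedding similarity: entries of B act as
# positive/negative pair weights for node embeddings z (illustrative sketch).
import torch

def modularity_loss(z, adj):
    """z: (n, d) node embeddings; adj: (n, n) dense 0/1 adjacency."""
    deg = adj.sum(dim=1)
    m = deg.sum() / 2
    b = adj - torch.outer(deg, deg) / (2 * m)    # modularity matrix B
    z = torch.nn.functional.normalize(z, dim=1)
    sim = z @ z.t()                              # cosine similarity of node pairs
    return -(b * sim).sum() / (2 * m)            # negate to maximize modularity

n = 8
adj = (torch.rand(n, n) > 0.6).float()
adj = ((adj + adj.t()) > 0).float().fill_diagonal_(0)  # symmetric, no self-loops
z = torch.randn(n, 4, requires_grad=True)
opt = torch.optim.Adam([z], lr=0.1)
for _ in range(50):
    loss = modularity_loss(z, adj)
    opt.zero_grad(); loss.backward(); opt.step()
```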
arXiv:2406.14250 [pdf, other] cs.CV cs.HC
E-ANT: A Large-Scale Dataset for Efficient Automatic GUI NavigaTion
Authors: Ke Wang, Tianyu Xia, Zhangxuan Gu, Yi Zhao, Shuheng Shen, Changhua Meng, Weiqiang Wang, Ke Xu
Abstract: Online GUI navigation on mobile devices has attracted a lot of attention in recent years, since it contributes to many real-world applications. With the rapid development of large language models (LLMs), multimodal large language models (MLLMs) have tremendous potential for this task. However, existing MLLMs need high-quality data to improve their ability to make correct navigation decisions according to human user inputs. In this paper, we develop a novel and highly valuable dataset, named E-ANT, the first Chinese GUI navigation dataset that contains real human behaviour and high-quality annotated screenshots, comprising nearly 40,000 real human traces over 5000+ different tinyAPPs. Furthermore, we evaluate various powerful MLLMs on E-ANT and show their experimental results with sufficient ablations. We believe that our proposed dataset will be beneficial for both the evaluation and development of GUI navigation and LLM/MLLM decision-making capabilities.
Submitted 1 July, 2024; v1 submitted 20 June, 2024; originally announced June 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages, 5 figures, Under review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.13450">arXiv:2406.13450</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.13450">pdf</a>, <a href="https://arxiv.org/format/2406.13450">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Federating to Grow Transformers with Constrained Resources without Model Sharing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Shen%2C+S">Shikun Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Zou%2C+Y">Yifei Zou</a>, <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+Y">Yuan Yuan</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+Y">Yanwei Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+P">Peng Li</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+X">Xiuzhen Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+D">Dongxiao Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.13450v1-abstract-short" style="display: inline;"> The high resource consumption of large-scale models discourages resource-constrained users from developing their customized transformers. To this end, this paper considers a federated framework named Fed-Grow for multiple participants to cooperatively scale a transformer from their pre-trained small models. Under the Fed-Grow, a Dual-LiGO (Dual Linear Growth Operator) architecture is designed to h&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.13450v1-abstract-full').style.display = 'inline'; document.getElementById('2406.13450v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.13450v1-abstract-full" style="display: none;"> The high resource consumption of large-scale models discourages resource-constrained users from developing their customized transformers. To this end, this paper considers a federated framework named Fed-Grow for multiple participants to cooperatively scale a transformer from their pre-trained small models. Under the Fed-Grow, a Dual-LiGO (Dual Linear Growth Operator) architecture is designed to help participants expand their pre-trained small models to a transformer. In Dual-LiGO, the Local-LiGO part is used to address the heterogeneity problem caused by the various pre-trained models, and the Global-LiGO part is shared to exchange the implicit knowledge from the pre-trained models, local data, and training process of participants. Instead of model sharing, only sharing the Global-LiGO strengthens the privacy of our approach. Compared with several state-of-the-art methods in simulation, our approach has higher accuracy, better precision, and lower resource consumption on computations and communications. 
To the best of our knowledge, most previous model-scaling works are centralized; ours is the first to cooperatively grow a transformer from multiple pre-trained heterogeneous models while protecting user privacy in terms of both local data and models. We hope that our approach can extend transformers to broadly distributed scenarios and encourage more resource-constrained users to enjoy the benefits of large-scale transformers.
Submitted 19 June, 2024; originally announced June 2024.
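The core primitive here, a linear growth operator, expands a small pre-trained weight matrix into a larger one via learnable linear maps. A simplified width-growth sketch under that assumption (shapes and the WidthGrow class are hypothetical; the paper's Dual-LiGO splits this into local and globally shared parts):

```python
# LiGO-style width growth: learnable maps expand a small pre-trained weight
# into the initialization of a wider layer (illustrative sketch only).
import torch

class WidthGrow(torch.nn.Module):
    def __init__(self, d_small, d_large):
        super().__init__()
        # Learnable expansion maps applied to the small model's weights.
        self.a = torch.nn.Parameter(torch.randn(d_large, d_small) * 0.02)
        self.b = torch.nn.Parameter(torch.randn(d_large, d_small) * 0.02)

    def forward(self, w_small):
        # w_small: (d_small, d_small) -> w_large: (d_large, d_large)
        return self.a @ w_small @ self.b.t()

grow = WidthGrow(d_small=64, d_large=256)
w_small = torch.randn(64, 64)   # stand-in for a pre-trained small-model weight
w_large = grow(w_small)         # initializes the corresponding large layer
print(w_large.shape)            # torch.Size([256, 256])
```

In a Fed-Grow-like setting, only the shared (global) operator's parameters would be exchanged between participants, never the models themselves.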
arXiv:2406.12552 [pdf, ps, other] cs.NE, doi: https://doi.org/10.1007/s41965-024-00156-x
Evolutionary Spiking Neural Networks: A Survey
Authors: Shuaijie Shen, Rui Zhang, Chao Wang, Renzhuo Huang, Aiersi Tuerhong, Qinghai Guo, Zhichao Lu, Jianguo Zhang, Luziwei Leng
Abstract: Spiking neural networks (SNNs) are gaining increasing attention as potentially computationally efficient alternatives to traditional artificial neural networks (ANNs). However, the unique information propagation mechanisms and the complexity of SNN neuron models pose challenges for adapting traditional methods developed for ANNs to SNNs. These challenges include both weight learning and architecture design. While surrogate gradient learning has shown some success in addressing the former challenge, the latter remains relatively unexplored. Recently, a novel paradigm utilizing evolutionary computation methods has emerged to tackle these challenges. This approach has resulted in the development of a variety of energy-efficient and high-performance SNNs across a wide range of machine learning benchmarks. In this paper, we present a survey of these works and initiate discussions on potential challenges ahead.
Submitted 18 June, 2024; originally announced June 2024.
Journal ref: J Membr Comput (2024)
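Surrogate gradient learning, which the survey cites as the main fix for SNN weight learning, replaces the non-differentiable spike threshold with a smooth derivative on the backward pass. A minimal sketch of that standard technique (not code from the paper; the fast-sigmoid surrogate and LIF constants are common choices):

```python
# Leaky integrate-and-fire neuron with a surrogate-gradient spike function.
import torch

class SpikeFn(torch.autograd.Function):
    @staticmethod
    def forward(ctx, v):
        ctx.save_for_backward(v)
        return (v > 0).float()                 # hard threshold: spike if v > 0

    @staticmethod
    def backward(ctx, grad_out):
        (v,) = ctx.saved_tensors
        # Surrogate: derivative of a fast sigmoid, smooth around the threshold.
        return grad_out / (1 + 10 * v.abs()) ** 2

def lif_step(v, x, decay=0.9, v_th=1.0):
    """One LIF step: leak, integrate input, spike, reset by subtraction."""
    v = decay * v + x
    s = SpikeFn.apply(v - v_th)
    return v - s * v_th, s

v = torch.zeros(4)
x = torch.rand(4, requires_grad=True)
v, s = lif_step(v, x)
s.sum().backward()                             # gradients flow via the surrogate
print(s, x.grad)
```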
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> J Membr Comput (2024) </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.12020">arXiv:2406.12020</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.12020">pdf</a>, <a href="https://arxiv.org/format/2406.12020">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> When Box Meets Graph Neural Network in Tag-aware Recommendation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lin%2C+F">Fake Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+Z">Ziwei Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+X">Xi Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+D">Da Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+S">Shitian Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xueying Li</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+T">Tong Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+S">Suojuan Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+E">Enhong Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.12020v1-abstract-short" style="display: inline;"> Last year has witnessed the re-flourishment of tag-aware recommender systems supported by the LLM-enriched tags. Unfortunately, though large efforts have been made, current solutions may fail to describe the diversity and uncertainty inherent in user preferences with only tag-driven profiles. Recently, with the development of geometry-based techniques, e.g., box embedding, diversity of user prefer&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.12020v1-abstract-full').style.display = 'inline'; document.getElementById('2406.12020v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.12020v1-abstract-full" style="display: none;"> Last year has witnessed the re-flourishment of tag-aware recommender systems supported by the LLM-enriched tags. Unfortunately, though large efforts have been made, current solutions may fail to describe the diversity and uncertainty inherent in user preferences with only tag-driven profiles. Recently, with the development of geometry-based techniques, e.g., box embedding, diversity of user preferences now could be fully modeled as the range within a box in high dimension space. However, defect still exists as these approaches are incapable of capturing high-order neighbor signals, i.e., semantic-rich multi-hop relations within the user-tag-item tripartite graph, which severely limits the effectiveness of user modeling. To deal with this challenge, in this paper, we propose a novel algorithm, called BoxGNN, to perform the message aggregation via combination of logical operations, thereby incorporating high-order signals. 
Specifically, we first embed users, items, and tags as hyper-boxes rather than simple points in the representation space, and define two logical operations to facilitate the subsequent process. Next, we perform the message aggregation mechanism via the combination of logical operations to obtain the corresponding high-order box representations. Finally, we adopt a volume-based learning objective with Gumbel smoothing techniques to refine the representation of boxes. Extensive experiments on two publicly available datasets and one LLM-enhanced e-commerce dataset have validated the superiority of BoxGNN compared with various state-of-the-art baselines. The code is released online.
Submitted 17 June, 2024; originally announced June 2024.
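The box-embedding machinery above rests on two standard pieces: intersection as a logical AND of boxes, and a smoothed volume so empty overlaps still yield gradients. A minimal sketch of those general primitives (box layout and scoring are assumptions, not the BoxGNN code):

```python
# Box intersection and smoothed volume for box embeddings. A box is a pair of
# (min, max) corner vectors; softplus smooths the hinge at zero overlap.
import torch
import torch.nn.functional as F

def intersect(box_a, box_b):
    """Logical AND of two boxes: element-wise max of mins, min of maxes."""
    (a_min, a_max), (b_min, b_max) = box_a, box_b
    return torch.maximum(a_min, b_min), torch.minimum(a_max, b_max)

def log_volume(box, temp=1.0):
    """Smoothed log-volume; softplus keeps gradients alive when boxes barely overlap."""
    lo, hi = box
    return torch.log(F.softplus(hi - lo, beta=1.0 / temp) + 1e-9).sum(-1)

user = (torch.zeros(4), torch.ones(4))                 # user preference box
item = (torch.full((4,), 0.5), torch.full((4,), 1.5))  # item box
score = log_volume(intersect(user, item))              # overlap volume as relevance
print(score)
```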
arXiv:2406.11431 [pdf, other] cs.CL cs.AI
Super(ficial)-alignment: Strong Models May Deceive Weak Models in Weak-to-Strong Generalization
Authors: Wenkai Yang, Shiqi Shen, Guangyao Shen, Wei Yao, Yong Liu, Zhi Gong, Yankai Lin, Ji-Rong Wen
Abstract: Superalignment, where humans act as weak supervisors for superhuman models, has become a crucial problem with the rapid development of Large Language Models (LLMs). Recent work has preliminarily studied this problem by using weak models to supervise strong models, and discovered that weakly supervised strong students can consistently outperform weak teachers towards the alignment target, leading to a weak-to-strong generalization phenomenon. However, we are concerned that behind such a promising phenomenon there may lie an issue of weak-to-strong deception, where strong models deceive weak models by exhibiting well-aligned behavior in areas known to weak models but producing misaligned behaviors in cases weak models do not know about. We take an initial step towards exploring this security issue in a specific but realistic multi-objective alignment case, where some alignment targets may conflict with each other (e.g., helpfulness vs. harmlessness). We aim to explore whether, in such cases, strong models might deliberately make mistakes in areas known to them but unknown to weak models within one alignment dimension, in exchange for a higher reward in another dimension. Through extensive experiments in both the reward modeling and preference optimization scenarios, we find: (1) the weak-to-strong deception phenomenon exists across all settings; (2) the deception intensifies as the capability gap between weak and strong models increases; (3) bootstrapping with an intermediate model can mitigate the deception to some extent, though its effectiveness remains limited. Our work highlights the urgent need to pay more attention to the true reliability of superalignment.
Submitted 8 October, 2024; v1 submitted 17 June, 2024; originally announced June 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Code is available at https://github.com/keven980716/weak-to-strong-deception</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.11271">arXiv:2406.11271</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.11271">pdf</a>, <a href="https://arxiv.org/format/2406.11271">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> MINT-1T: Scaling Open-Source Multimodal Data by 10x: A Multimodal Dataset with One Trillion Tokens </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Awadalla%2C+A">Anas Awadalla</a>, <a href="/search/cs?searchtype=author&amp;query=Xue%2C+L">Le Xue</a>, <a href="/search/cs?searchtype=author&amp;query=Lo%2C+O">Oscar Lo</a>, <a href="/search/cs?searchtype=author&amp;query=Shu%2C+M">Manli Shu</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+H">Hannah Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Guha%2C+E+K">Etash Kumar Guha</a>, <a href="/search/cs?searchtype=author&amp;query=Jordan%2C+M">Matt Jordan</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+S">Sheng Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Awadalla%2C+M">Mohamed Awadalla</a>, <a href="/search/cs?searchtype=author&amp;query=Savarese%2C+S">Silvio Savarese</a>, <a href="/search/cs?searchtype=author&amp;query=Xiong%2C+C">Caiming Xiong</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+R">Ran Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Choi%2C+Y">Yejin Choi</a>, <a href="/search/cs?searchtype=author&amp;query=Schmidt%2C+L">Ludwig Schmidt</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.11271v5-abstract-short" style="display: inline;"> Multimodal interleaved datasets featuring free-form interleaved sequences of images and text are crucial for training frontier large multimodal models (LMMs). Despite the rapid progression of open-source LMMs, there remains a pronounced scarcity of large-scale, diverse open-source multimodal interleaved datasets. In response, we introduce MINT-1T, the most extensive and diverse open-source Multimo&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.11271v5-abstract-full').style.display = 'inline'; document.getElementById('2406.11271v5-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.11271v5-abstract-full" style="display: none;"> Multimodal interleaved datasets featuring free-form interleaved sequences of images and text are crucial for training frontier large multimodal models (LMMs). Despite the rapid progression of open-source LMMs, there remains a pronounced scarcity of large-scale, diverse open-source multimodal interleaved datasets. In response, we introduce MINT-1T, the most extensive and diverse open-source Multimodal INTerleaved dataset to date. 
arXiv:2406.11271 [pdf, other] cs.CV cs.LG
MINT-1T: Scaling Open-Source Multimodal Data by 10x: A Multimodal Dataset with One Trillion Tokens
Authors: Anas Awadalla, Le Xue, Oscar Lo, Manli Shu, Hannah Lee, Etash Kumar Guha, Matt Jordan, Sheng Shen, Mohamed Awadalla, Silvio Savarese, Caiming Xiong, Ran Xu, Yejin Choi, Ludwig Schmidt
Abstract: Multimodal interleaved datasets featuring free-form interleaved sequences of images and text are crucial for training frontier large multimodal models (LMMs). Despite the rapid progression of open-source LMMs, there remains a pronounced scarcity of large-scale, diverse open-source multimodal interleaved datasets. In response, we introduce MINT-1T, the most extensive and diverse open-source Multimodal INTerleaved dataset to date. MINT-1T comprises one trillion text tokens and 3.4 billion images, a 10x scale-up from existing open-source datasets. Additionally, we include previously untapped sources such as PDFs and arXiv papers. As scaling multimodal interleaved datasets requires substantial engineering effort, sharing the data curation process and releasing the dataset greatly benefits the community. Our experiments show that LMMs trained on MINT-1T rival the performance of models trained on the previous leading dataset, OBELICS. Our data and code will be released at https://github.com/mlfoundations/MINT-1T.
Submitted 30 October, 2024; v1 submitted 17 June, 2024; originally announced June 2024.
arXiv:2406.07885 [pdf, other] cs.LG
GENIU: A Restricted Data Access Unlearning for Imbalanced Data
Authors: Chenhao Zhang, Shaofei Shen, Yawen Zhao, Weitong Tony Chen, Miao Xu
Abstract: With the increasing emphasis on data privacy, the significance of machine unlearning has grown substantially. Class unlearning, which involves enabling a trained model to forget data belonging to a specific class learned before, is important, as classification tasks account for the majority of today's machine learning as a service (MLaaS). Retraining the model on the original data, excluding the data to be forgotten (a.k.a. the forgetting data), is a common approach to class unlearning. However, the availability of original data during the unlearning phase is not always guaranteed, leading to the exploration of class unlearning with restricted data access. While current unlearning methods with restricted data access usually generate proxy samples via the trained neural network classifier, they typically focus on training and forgetting balanced data. However, imbalanced original data can cause trouble for these proxies and for unlearning, particularly when the forgetting data consists predominantly of the majority class. To address this issue, we propose the GENerative Imbalanced Unlearning (GENIU) framework. GENIU utilizes a Variational Autoencoder (VAE) to train a proxy generator concurrently with the original model. These generated proxies accurately represent each class and are leveraged in the unlearning phase, eliminating the reliance on the original training data. To further mitigate the performance degradation resulting from forgetting the majority class, we introduce an in-batch tuning strategy that works with the generated proxies. GENIU is the first practical framework for class unlearning in imbalanced data settings with restricted data access, ensuring the preservation of essential information for future unlearning. Experimental results confirm the superiority of GENIU over existing methods, establishing its effectiveness in empirical scenarios.
Submitted 12 June, 2024; originally announced June 2024.
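The proxy generator above is, at its core, a class-conditional VAE trained alongside the classifier so per-class samples can be decoded later without the original data. A minimal sketch of that general mechanism (architecture and sizes are assumptions, not the GENIU code):

```python
# Class-conditional VAE for generating per-class proxy samples.
import torch
import torch.nn.functional as F

class CVAE(torch.nn.Module):
    def __init__(self, x_dim=784, n_classes=10, z_dim=16):
        super().__init__()
        self.enc = torch.nn.Linear(x_dim + n_classes, 2 * z_dim)
        self.dec = torch.nn.Linear(z_dim + n_classes, x_dim)
        self.n_classes = n_classes

    def forward(self, x, y):
        y1h = F.one_hot(y, self.n_classes).float()
        mu, logvar = self.enc(torch.cat([x, y1h], dim=1)).chunk(2, dim=1)
        z = mu + torch.randn_like(mu) * (0.5 * logvar).exp()   # reparameterize
        recon = torch.sigmoid(self.dec(torch.cat([z, y1h], dim=1)))
        kl = -0.5 * (1 + logvar - mu.pow(2) - logvar.exp()).sum(1).mean()
        return F.binary_cross_entropy(recon, x) + kl

vae = CVAE()
x, y = torch.rand(32, 784), torch.randint(0, 10, (32,))
loss = vae(x, y)   # train alongside the classifier; at unlearning time, decode
loss.backward()    # class-conditioned proxies instead of touching original data
```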
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.07835">arXiv:2406.07835</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.07835">pdf</a>, <a href="https://arxiv.org/format/2406.07835">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> SciRIFF: A Resource to Enhance Language Model Instruction-Following over Scientific Literature </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wadden%2C+D">David Wadden</a>, <a href="/search/cs?searchtype=author&amp;query=Shi%2C+K">Kejian Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Morrison%2C+J">Jacob Morrison</a>, <a href="/search/cs?searchtype=author&amp;query=Naik%2C+A">Aakanksha Naik</a>, <a href="/search/cs?searchtype=author&amp;query=Singh%2C+S">Shruti Singh</a>, <a href="/search/cs?searchtype=author&amp;query=Barzilay%2C+N">Nitzan Barzilay</a>, <a href="/search/cs?searchtype=author&amp;query=Lo%2C+K">Kyle Lo</a>, <a href="/search/cs?searchtype=author&amp;query=Hope%2C+T">Tom Hope</a>, <a href="/search/cs?searchtype=author&amp;query=Soldaini%2C+L">Luca Soldaini</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+S+Z">Shannon Zejiang Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Downey%2C+D">Doug Downey</a>, <a href="/search/cs?searchtype=author&amp;query=Hajishirzi%2C+H">Hannaneh Hajishirzi</a>, <a href="/search/cs?searchtype=author&amp;query=Cohan%2C+A">Arman Cohan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.07835v3-abstract-short" style="display: inline;"> We present SciRIFF (Scientific Resource for Instruction-Following and Finetuning), a dataset of 137K instruction-following demonstrations for 54 tasks covering five essential scientific literature understanding capabilities: information extraction, summarization, question answering, claim verification, and classification. SciRIFF demonstrations are notable for their long input contexts, detailed t&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.07835v3-abstract-full').style.display = 'inline'; document.getElementById('2406.07835v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.07835v3-abstract-full" style="display: none;"> We present SciRIFF (Scientific Resource for Instruction-Following and Finetuning), a dataset of 137K instruction-following demonstrations for 54 tasks covering five essential scientific literature understanding capabilities: information extraction, summarization, question answering, claim verification, and classification. SciRIFF demonstrations are notable for their long input contexts, detailed task specifications, and complex structured outputs. While instruction-following resources are available in specific domains such as clinical medicine and chemistry, SciRIFF is the first dataset focused on extracting and synthesizing information from research literature across a wide range of scientific fields. 
To demonstrate the utility of SciRIFF, we develop a sample-efficient strategy to adapt a general instruction-following model for science by performing additional finetuning on a mix of general-domain and SciRIFF demonstrations. In evaluations on nine held-out scientific tasks, our model -- called SciTulu -- improves over a strong LLM baseline by 28.1% and 6.5% at the 7B and 70B scales respectively, while maintaining general instruction-following performance within 2% of the baseline. We are optimistic that SciRIFF will facilitate the development and evaluation of LLMs to help researchers navigate the ever-growing body of scientific literature. We release our dataset, model checkpoints, and data processing and evaluation code to enable further research.
Submitted 19 August, 2024; v1 submitted 10 June, 2024; originally announced June 2024.
Comments: Submitted to NeurIPS Datasets and Benchmarks 2024
arXiv:2406.05628 [pdf, other] cs.LG
Domain Generalization Guided by Large-Scale Pre-Trained Priors
Authors: Zongbin Wang, Bin Pan, Shiyu Shen, Tianyang Shi, Zhenwei Shi
Abstract: Domain generalization (DG) aims to train a model from limited source domains, allowing it to generalize to unknown target domains. Typically, DG models employ large-scale pre-trained models only during the initialization of fine-tuning. However, large-scale pre-trained models already possess the ability to resist domain shift. If we reference pre-trained models continuously during fine-tuning to maintain this ability, it could further enhance the generalization ability of the DG model. For this purpose, we introduce a new method called Fine-Tune with Large-scale pre-trained Priors (FT-LP), which incorporates the pre-trained model as a prior into the DG fine-tuning process, ensuring that the model refers to its pre-trained model at each optimization step. FT-LP comprises a theoretical framework and a simple implementation strategy. In theory, we verify the rationality of FT-LP by introducing a generalization error bound with the pre-trained priors for DG. In implementation, we utilize an encoder to simulate the model distribution, enabling the use of FT-LP when only pre-trained weights are available. In summary, we offer a new fine-tuning method for DG algorithms that utilizes pre-trained models throughout the fine-tuning process. Through experiments on various datasets and DG models, our proposed method exhibits significant improvements, indicating its effectiveness.
Submitted 8 June, 2024; originally announced June 2024.
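The simplest way to "refer to the pre-trained model at each optimization step" is a proximal penalty that pulls the fine-tuned weights toward the frozen pre-trained ones. A minimal sketch of that idea (FT-LP itself models the prior with an encoder; the model, data, and penalty weight below are assumptions):

```python
# Fine-tuning with a pre-trained prior: task loss plus a squared-distance
# penalty to the frozen pre-trained weights at every step (illustrative).
import copy
import torch

model = torch.nn.Linear(16, 2)       # stand-in for a pre-trained backbone
prior = copy.deepcopy(model)         # frozen pre-trained reference
for p in prior.parameters():
    p.requires_grad_(False)

opt = torch.optim.SGD(model.parameters(), lr=1e-2)
ce, lam = torch.nn.CrossEntropyLoss(), 0.1

for step in range(100):
    x = torch.randn(32, 16)
    y = torch.randint(0, 2, (32,))
    task_loss = ce(model(x), y)
    # Prior term: stay close to the pre-trained weights to keep their
    # resistance to domain shift.
    prox = sum((p - q).pow(2).sum()
               for p, q in zip(model.parameters(), prior.parameters()))
    loss = task_loss + lam * prox
    opt.zero_grad(); loss.backward(); opt.step()
```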
</p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Shen%2C+S&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Shen%2C+S&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Shen%2C+S&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Shen%2C+S&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Shen%2C+S&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Shen%2C+S&amp;start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">&hellip;</span></li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a 
class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>
