
Search | arXiv e-print repository

<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 339 results for author: <span class="mathjax">Yu, R</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Yu%2C+R">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Yu, R"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Yu%2C+R&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Yu, R"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Yu%2C+R&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Yu%2C+R&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Yu%2C+R&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Yu%2C+R&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Yu%2C+R&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Yu%2C+R&amp;start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Yu%2C+R&amp;start=250" class="pagination-link " aria-label="Page 6" aria-current="page">6 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Yu%2C+R&amp;start=300" class="pagination-link " aria-label="Page 7" aria-current="page">7 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14250">arXiv:2411.14250</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.14250">pdf</a>, <a href="https://arxiv.org/format/2411.14250">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> CP-UNet: Contour-based Probabilistic Model for Medical Ultrasound Images Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yu%2C+R">Ruiguo Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yiyang Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Tian%2C+Y">Yuan Tian</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Zhiqiang Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xuewei Li</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+J">Jie Gao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14250v1-abstract-short" style="display: inline;"> Deep learning-based segmentation methods are widely utilized for detecting lesions in 
ultrasound images. Throughout the imaging procedure, the attenuation and scattering of ultrasound waves cause contour blurring and the formation of artifacts, limiting the clarity of the acquired ultrasound images. To overcome this challenge, we propose a contour-based probabilistic segmentation model CP-UNet, wh&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14250v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14250v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14250v1-abstract-full" style="display: none;"> Deep learning-based segmentation methods are widely utilized for detecting lesions in ultrasound images. Throughout the imaging procedure, the attenuation and scattering of ultrasound waves cause contour blurring and the formation of artifacts, limiting the clarity of the acquired ultrasound images. To overcome this challenge, we propose a contour-based probabilistic segmentation model CP-UNet, which guides the segmentation network to enhance its focus on contour during decoding. We design a novel down-sampling module to enable the contour probability distribution modeling and encoding stages to acquire global-local features. Furthermore, the Gaussian Mixture Model utilizes optimized features to model the contour distribution, capturing the uncertainty of lesion boundaries. Extensive experiments with several state-of-the-art deep learning segmentation methods on three ultrasound image datasets show that our method performs better on breast and thyroid lesions segmentation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14250v1-abstract-full').style.display = 'none'; document.getElementById('2411.14250v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">4 pages, 4 figures, 2 tables;For icassp2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12973">arXiv:2411.12973</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.12973">pdf</a>, <a href="https://arxiv.org/format/2411.12973">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Adaptive Process-Guided Learning: An Application in Predicting Lake DO Concentrations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yu%2C+R">Runlong Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Qiu%2C+C">Chonghao Qiu</a>, <a href="/search/cs?searchtype=author&amp;query=Ladwig%2C+R">Robert Ladwig</a>, <a href="/search/cs?searchtype=author&amp;query=Hanson%2C+P+C">Paul C. 
Hanson</a>, <a href="/search/cs?searchtype=author&amp;query=Xie%2C+Y">Yiqun Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yanhua Li</a>, <a href="/search/cs?searchtype=author&amp;query=Jia%2C+X">Xiaowei Jia</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12973v1-abstract-short" style="display: inline;"> This paper introduces a \textit{Process-Guided Learning (Pril)} framework that integrates physical models with recurrent neural networks (RNNs) to enhance the prediction of dissolved oxygen (DO) concentrations in lakes, which is crucial for sustaining water quality and ecosystem health. Unlike traditional RNNs, which may deliver high accuracy but often lack physical consistency and broad applicabi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12973v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12973v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12973v1-abstract-full" style="display: none;"> This paper introduces a \textit{Process-Guided Learning (Pril)} framework that integrates physical models with recurrent neural networks (RNNs) to enhance the prediction of dissolved oxygen (DO) concentrations in lakes, which is crucial for sustaining water quality and ecosystem health. Unlike traditional RNNs, which may deliver high accuracy but often lack physical consistency and broad applicability, the \textit{Pril} method incorporates differential DO equations for each lake layer, modeling it as a first-order linear solution using a forward Euler scheme with a daily timestep. However, this method is sensitive to numerical instabilities. When drastic fluctuations occur, the numerical integration is neither mass-conservative nor stable. Especially during stratified conditions, exogenous fluxes into each layer cause significant within-day changes in DO concentrations. To address this challenge, we further propose an \textit{Adaptive Process-Guided Learning (April)} model, which dynamically adjusts timesteps from daily to sub-daily intervals with the aim of mitigating the discrepancies caused by variations in entrainment fluxes. \textit{April} uses a generator-discriminator architecture to identify days with significant DO fluctuations and employs a multi-step Euler scheme with sub-daily timesteps to effectively manage these variations. We have tested our methods on a wide range of lakes in the Midwestern USA, and demonstrated robust capability in predicting DO concentrations even with limited training data. While primarily focused on aquatic ecosystems, this approach is broadly applicable to diverse scientific and engineering disciplines that utilize process-based models, such as power engineering, climate science, and biomedicine. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12973v1-abstract-full').style.display = 'none'; document.getElementById('2411.12973v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
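The Pril/April entry above rests on a standard numerical idea: a forward Euler step per day, replaced by several sub-daily Euler steps on days with large fluxes. The following is a minimal, self-contained sketch of that idea only; the function names, the threshold-based switch (the paper instead uses a learned generator-discriminator to flag high-fluctuation days), and the toy flux model are all assumptions made for illustration, not details from the paper.

```python
import numpy as np

def integrate_do(do0, flux_fn, days, dt_day=1.0, max_delta=2.0, n_sub=24):
    """Toy forward-Euler integration of a per-layer DO balance dDO/dt = flux(DO, t).

    Marches with a daily step by default and falls back to sub-daily Euler steps
    on days where the daily update would be large. Thresholds are illustrative.
    """
    do = float(do0)
    trajectory = [do]
    for day in range(days):
        daily_update = dt_day * flux_fn(do, day)
        if abs(daily_update) <= max_delta:
            do += daily_update                      # plain daily Euler step
        else:
            dt = dt_day / n_sub
            for k in range(n_sub):                  # multi-step Euler with sub-daily dt
                do += dt * flux_fn(do, day + k * dt)
        do = max(do, 0.0)                           # concentration cannot go negative
        trajectory.append(do)
    return np.array(trajectory)

# Made-up flux: relaxation toward saturation plus a slow sinusoidal exogenous term.
traj = integrate_do(8.0, lambda do, t: 0.5 * (9.0 - do) + 3.0 * np.sin(2 * np.pi * t / 10.0), days=30)
```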
3. arXiv:2411.10133 [pdf, other]
   Subjects: cs.CV (Computer Vision and Pattern Recognition)
   Title: Efficient Density Control for 3D Gaussian Splatting
   Authors: Xiaobin Deng, Changyu Diao, Min Li, Ruohan Yu, Duanqing Xu
   Abstract: 3D Gaussian Splatting (3DGS) excels in novel view synthesis, balancing advanced rendering quality with real-time performance. However, in trained scenes, a large number of Gaussians with low opacity significantly increase rendering costs. This issue arises due to flaws in the split and clone operations during the densification process, which lead to extensive Gaussian overlap and subsequent opacity reduction. To enhance the efficiency of Gaussian utilization, we improve the adaptive density control of 3DGS. First, we introduce a more efficient long-axis split operation to replace the original clone and split, which mitigates Gaussian overlap and improves densification efficiency. Second, we propose a simple adaptive pruning technique to reduce the number of low-opacity Gaussians. Finally, by dynamically lowering the splitting threshold and applying importance weighting, the efficiency of Gaussian utilization is further improved. We evaluate our proposed method on various challenging real-world datasets. Experimental results show that our Efficient Density Control (EDC) can enhance both the rendering speed and quality.
   Submitted 15 November, 2024; originally announced November 2024.
4. arXiv:2411.06750 [pdf, other]
   Subjects: eess.IV (Image and Video Processing); cs.CV (Computer Vision and Pattern Recognition)
   Title: SynStitch: a Self-Supervised Learning Network for Ultrasound Image Stitching Using Synthetic Training Pairs and Indirect Supervision
   Authors: Xing Yao, Runxuan Yu, Dewei Hu, Hao Yang, Ange Lou, Jiacheng Wang, Daiwei Lu, Gabriel Arenas, Baris Oguz, Alison Pouch, Nadav Schwartz, Brett C Byram, Ipek Oguz
   Abstract: Ultrasound (US) image stitching can expand the field-of-view (FOV) by combining multiple US images from varied probe positions. However, registering US images with only partially overlapping anatomical contents is a challenging task. In this work, we introduce SynStitch, a self-supervised framework designed for 2DUS stitching. SynStitch consists of a synthetic stitching pair generation module (SSPGM) and an image stitching module (ISM). SSPGM utilizes a patch-conditioned ControlNet to generate realistic 2DUS stitching pairs with known affine matrix from a single input image. ISM then utilizes this synthetic paired data to learn 2DUS stitching in a supervised manner. Our framework was evaluated against multiple leading methods on a kidney ultrasound dataset, demonstrating superior 2DUS stitching performance through both qualitative and quantitative analyses. The code will be made public upon acceptance of the paper.
   Submitted 11 November, 2024; originally announced November 2024.
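The core supervision trick in the SynStitch entry is generating a second view with a known affine matrix from a single image, so that the stitching network has exact ground truth. A minimal sketch of just that step is below; the realistic appearance synthesis (the patch-conditioned ControlNet) is omitted, and all parameter ranges and names are made up for illustration.

```python
import numpy as np
from scipy.ndimage import affine_transform

def make_affine_pair(image, max_angle_deg=15.0, max_shift=20.0, rng=None):
    """Create a (moving image, ground-truth rotation, ground-truth shift) pair from one 2D image."""
    rng = np.random.default_rng() if rng is None else rng
    theta = np.deg2rad(rng.uniform(-max_angle_deg, max_angle_deg))
    rot = np.array([[np.cos(theta), -np.sin(theta)],
                    [np.sin(theta),  np.cos(theta)]])
    shift = rng.uniform(-max_shift, max_shift, size=2)

    # affine_transform maps output coordinates to input coordinates: x_in = rot @ x_out + shift
    moving = affine_transform(image, rot, offset=shift, order=1, mode="nearest")
    return moving, rot, shift

fixed = np.random.rand(256, 256).astype(np.float32)   # stand-in for a 2D US frame
moving, rot, shift = make_affine_pair(fixed)
```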
5. arXiv:2411.06735 [pdf, other]
   Subjects: cs.AI (Artificial Intelligence)
   Title: Multi-Modal Forecaster: Jointly Predicting Time Series and Textual Data
   Authors: Kai Kim, Howard Tsai, Rajat Sen, Abhimanyu Das, Zihao Zhou, Abhishek Tanpure, Mathew Luo, Rose Yu
   Abstract: Current forecasting approaches are largely unimodal and ignore the rich textual data that often accompany the time series due to lack of well-curated multimodal benchmark dataset. In this work, we develop TimeText Corpus (TTC), a carefully curated, time-aligned text and time dataset for multimodal forecasting. Our dataset is composed of sequences of numbers and text aligned to timestamps, and includes data from two different domains: climate science and healthcare. Our data is a significant contribution to the rare selection of available multimodal datasets. We also propose the Hybrid Multi-Modal Forecaster (Hybrid-MMF), a multimodal LLM that jointly forecasts both text and time series data using shared embeddings. However, contrary to our expectations, our Hybrid-MMF model does not outperform existing baselines in our experiments. This negative result highlights the challenges inherent in multimodal forecasting. Our code and data are available at https://github.com/Rose-STL-Lab/Multimodal_Forecasting.
   Submitted 20 November, 2024; v1 submitted 11 November, 2024; originally announced November 2024.
   Comments: 21 pages, 4 tables, 2 figures

6. arXiv:2411.06193 [pdf, ps, other]
   Subjects: cs.IT (Information Theory); eess.SP (Signal Processing)
   Title: Large Language Models and Artificial Intelligence Generated Content Technologies Meet Communication Networks
   Authors: Jie Guo, Meiting Wang, Hang Yin, Bin Song, Yuhao Chi, Fei Richard Yu, Chau Yuen
   Abstract: Artificial intelligence generated content (AIGC) technologies, with a predominance of large language models (LLMs), have demonstrated remarkable performance improvements in various applications, which have attracted great interests from both academia and industry. Although some noteworthy advancements have been made in this area, a comprehensive exploration of the intricate relationship between AIGC and communication networks remains relatively limited. To address this issue, this paper conducts an exhaustive survey from dual standpoints: firstly, it scrutinizes the integration of LLMs and AIGC technologies within the domain of communication networks; secondly, it investigates how the communication networks can further bolster the capabilities of LLMs and AIGC. Additionally, this research explores the promising applications along with the challenges encountered during the incorporation of these AI technologies into communication networks. Through these detailed analyses, our work aims to deepen the understanding of how LLMs and AIGC can synergize with and enhance the development of advanced intelligent communication networks, contributing to a more profound comprehension of next-generation intelligent communication networks.
   Submitted 12 November, 2024; v1 submitted 9 November, 2024; originally announced November 2024.
   Comments: Accepted by IEEE Internet of Things Journal

7. arXiv:2411.05331 [pdf, other]
   Subjects: cs.LG (Machine Learning); stat.ML (Machine Learning)
   Title: Discovering Latent Structural Causal Models from Spatio-Temporal Data
   Authors: Kun Wang, Sumanth Varambally, Duncan Watson-Parris, Yi-An Ma, Rose Yu
   Abstract: Many important phenomena in scientific fields such as climate, neuroscience, and epidemiology are naturally represented as spatiotemporal gridded data with complex interactions. For example, in climate science, researchers aim to uncover how large-scale events, such as the North Atlantic Oscillation (NAO) and the Antarctic Oscillation (AAO), influence other global processes. Inferring causal relationships from these data is a challenging problem compounded by the high dimensionality of such data and the correlations between spatially proximate points. We present SPACY (SPAtiotemporal Causal discoverY), a novel framework based on variational inference, designed to explicitly model latent time-series and their causal relationships from spatially confined modes in the data. Our method uses an end-to-end training process that maximizes an evidence-lower bound (ELBO) for the data likelihood. Theoretically, we show that, under some conditions, the latent variables are identifiable up to transformation by an invertible matrix. Empirically, we show that SPACY outperforms state-of-the-art baselines on synthetic data, remains scalable for large grids, and identifies key known phenomena from real-world climate data.
   Submitted 8 November, 2024; originally announced November 2024.
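For reference, the "evidence-lower bound" mentioned in the SPACY entry is the standard variational objective; the generic form for observations x and latents z (not SPACY's specific factorization over latent time series) is:

```latex
\log p_\theta(x) \;\ge\; \mathcal{L}(\theta,\phi)
  \;=\; \mathbb{E}_{q_\phi(z \mid x)}\!\left[\log p_\theta(x \mid z)\right]
  \;-\; \mathrm{KL}\!\left(q_\phi(z \mid x)\,\|\,p(z)\right)
```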
8. arXiv:2411.01155 [pdf, other]
   Subjects: cs.LG (Machine Learning)
   Title: HG-Adapter: Improving Pre-Trained Heterogeneous Graph Neural Networks with Dual Adapters
   Authors: Yujie Mo, Runpeng Yu, Xiaofeng Zhu, Xinchao Wang
   Abstract: The "pre-train, prompt-tuning" paradigm has demonstrated impressive performance for tuning pre-trained heterogeneous graph neural networks (HGNNs) by mitigating the gap between pre-trained models and downstream tasks. However, most prompt-tuning-based works may face at least two limitations: (i) the model may be insufficient to fit the graph structures well as they are generally ignored in the prompt-tuning stage, increasing the training error to decrease the generalization ability; and (ii) the model may suffer from the limited labeled data during the prompt-tuning stage, leading to a large generalization gap between the training error and the test error to further affect the model generalization. To alleviate the above limitations, we first derive the generalization error bound for existing prompt-tuning-based methods, and then propose a unified framework that combines two new adapters with potential labeled data extension to improve the generalization of pre-trained HGNN models. Specifically, we design dual structure-aware adapters to adaptively fit task-related homogeneous and heterogeneous structural information. We further design a label-propagated contrastive loss and two self-supervised losses to optimize dual adapters and incorporate unlabeled nodes as potential labeled data. Theoretical analysis indicates that the proposed method achieves a lower generalization error bound than existing methods, thus obtaining superior generalization ability. Comprehensive experiments demonstrate the effectiveness and generalization of the proposed method on different downstream tasks.
   Submitted 2 November, 2024; originally announced November 2024.
   Comments: 23 pages

9. arXiv:2411.00838 [pdf, other]
   Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence)
   Title: Task-Oriented Real-time Visual Inference for IoVT Systems: A Co-design Framework of Neural Networks and Edge Deployment
   Authors: Jiaqi Wu, Simin Chen, Zehua Wang, Wei Chen, Zijian Tian, F. Richard Yu, Victor C. M. Leung
   Abstract: As the volume of image data grows, data-oriented cloud computing in Internet of Video Things (IoVT) systems encounters latency issues. Task-oriented edge computing addresses this by shifting data analysis to the edge. However, limited computational power of edge devices poses challenges for executing visual tasks. Existing methods struggle to balance high model performance with low resource consumption; lightweight neural networks often underperform, while device-specific models designed by Neural Architecture Search (NAS) fail to adapt to heterogeneous devices. For these issues, we propose a novel co-design framework to optimize neural network architecture and deployment strategies during inference for high throughput. Specifically, it implements a dynamic model structure based on re-parameterization, coupled with a Roofline-based model partitioning strategy to enhance the computational performance of edge devices. We also employ a multi-objective co-optimization approach to balance throughput and accuracy. Additionally, we derive mathematical consistency and convergence of partitioned models. Experimental results demonstrate significant improvements in throughput (12.05% on MNIST, 18.83% on ImageNet) and superior classification accuracy compared to baseline algorithms. Our method consistently achieves stable performance across different devices, underscoring its adaptability. Simulated experiments further confirm its efficacy in high-accuracy, real-time detection for small objects in IoVT systems.
   Submitted 29 October, 2024; originally announced November 2024.
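The entry above cites a "Roofline-based model partitioning strategy". The roofline model itself is standard: a layer's attainable throughput is the minimum of the device's compute roof and its memory bandwidth times the layer's arithmetic intensity. The sketch below shows only that textbook bound with made-up device numbers and layer statistics; the paper's actual partitioning criterion is not given in the abstract.

```python
from dataclasses import dataclass

@dataclass
class Device:
    peak_flops: float      # FLOP/s the device can sustain
    peak_bandwidth: float  # bytes/s of memory bandwidth

def attainable_flops(device: Device, flops: float, bytes_moved: float) -> float:
    """Classic roofline bound for one layer: min(compute roof, bandwidth * intensity)."""
    intensity = flops / bytes_moved                    # FLOPs per byte moved
    return min(device.peak_flops, device.peak_bandwidth * intensity)

def estimated_latency(device: Device, flops: float, bytes_moved: float) -> float:
    return flops / attainable_flops(device, flops, bytes_moved)

# Hypothetical edge device and two layers; a partitioner would place each layer
# on whichever device (edge vs. server) gives the lower estimated latency.
edge = Device(peak_flops=1e12, peak_bandwidth=25e9)
conv = dict(flops=2.0e9, bytes_moved=40e6)             # compute-bound layer
fc   = dict(flops=8.0e6, bytes_moved=16e6)             # bandwidth-bound layer
print(estimated_latency(edge, **conv), estimated_latency(edge, **fc))
```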
10. arXiv:2411.00826 [pdf, other]
   Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence); cs.LG (Machine Learning)
   Title: Uncertainty Quantification via Hölder Divergence for Multi-View Representation Learning
   Authors: an Zhang, Ming Li, Chun Li, Zhaoxia Liu, Ye Zhang, Fei Richard Yu
   Abstract: Evidence-based deep learning represents a burgeoning paradigm for uncertainty estimation, offering reliable predictions with negligible extra computational overheads. Existing methods usually adopt Kullback-Leibler divergence to estimate the uncertainty of network predictions, ignoring domain gaps among various modalities. To tackle this issue, this paper introduces a novel algorithm based on Hölder Divergence (HD) to enhance the reliability of multi-view learning by addressing inherent uncertainty challenges from incomplete or noisy data. Generally, our method extracts the representations of multiple modalities through parallel network branches, and then employs HD to estimate the prediction uncertainties. Through the Dempster-Shafer theory, uncertainty from different modalities is integrated, generating a comprehensive result that considers all available representations. Mathematically, HD proves to better measure the "distance" between the real data distribution and the predictive distribution of the model, and improves the performance of multi-class recognition tasks. Specifically, our method surpasses the existing state-of-the-art counterparts on all evaluated benchmarks. We further conduct extensive experiments on different backbones to verify our superior robustness. It is demonstrated that our method successfully pushes the corresponding performance boundaries. Finally, we perform experiments on more challenging scenarios, i.e., learning with incomplete or noisy data, revealing that our method exhibits a high tolerance to such corrupted data.
   Submitted 29 October, 2024; originally announced November 2024.
   Comments: NA

11. arXiv:2411.00412 [pdf, other]
   Subjects: cs.LG (Machine Learning); cs.AI (Artificial Intelligence); cs.CL (Computation and Language)
   Title: Adapting While Learning: Grounding LLMs for Scientific Problems with Intelligent Tool Usage Adaptation
   Authors: Bohan Lyu, Yadi Cao, Duncan Watson-Parris, Leon Bergen, Taylor Berg-Kirkpatrick, Rose Yu
   Abstract: Large Language Models (LLMs) demonstrate promising capabilities in solving simple scientific problems but often produce hallucinations for complex ones. While integrating LLMs with tools can increase reliability, this approach typically results in over-reliance on tools, diminishing the model's ability to solve simple problems through basic reasoning. In contrast, human experts first assess problem complexity using domain knowledge before choosing an appropriate solution approach. Inspired by this human problem-solving process, we propose a novel two-component fine-tuning method. In the first component, World Knowledge Distillation (WKD), LLMs learn directly from solutions generated using tool's information to internalize domain knowledge. In the second component, Tool Usage Adaptation (TUA), we partition problems into easy and hard categories based on the model's direct answering accuracy. While maintaining the same alignment target for easy problems as in WKD, we train the model to intelligently switch to tool usage for more challenging problems. We validate our method on six scientific benchmark datasets, spanning mathematics, climate science and epidemiology. On average, our models demonstrate a 28.18% improvement in answer accuracy and a 13.89% increase in tool usage precision across all datasets, surpassing state-of-the-art models including GPT-4o and Claude-3.5.
   Submitted 1 November, 2024; originally announced November 2024.
   Comments: 26 pages, 15 figures
   ACM Class: I.2.6; I.2.7
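The TUA step in the entry above partitions problems by how often the model answers them correctly without tools. A minimal sketch of that partitioning logic follows; the sampling count, the accuracy threshold, and the function and field names are assumptions for illustration, not details taken from the paper.

```python
def partition_by_direct_accuracy(problems, model_answers_fn, n_samples=8, threshold=0.5):
    """Split problems into 'easy' and 'hard' by the model's direct-answer accuracy.

    problems          : list of dicts, each assumed to carry a "gold" reference answer
    model_answers_fn  : callable returning n_samples direct (tool-free) model answers
    """
    easy, hard = [], []
    for problem in problems:
        answers = model_answers_fn(problem, n_samples)           # n direct attempts
        accuracy = sum(a == problem["gold"] for a in answers) / n_samples
        (easy if accuracy >= threshold else hard).append(problem)
    return easy, hard
```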
arXiv:2410.22282 [pdf, other] (cs.CY)
Title: Whose ChatGPT? Unveiling Real-World Educational Inequalities Introduced by Large Language Models
Authors: Renzhe Yu, Zhen Xu, Sky CH-Wang, Richard Arum
Abstract: The universal availability of ChatGPT and other similar tools since late 2022 has prompted tremendous public excitement and experimental effort about the potential of large language models (LLMs) to improve learning experiences and outcomes, especially for learners from disadvantaged backgrounds. However, little research has systematically examined the real-world impacts of LLM availability on educational equity beyond theoretical projections and controlled studies of innovative LLM applications. To depict trends of post-LLM inequalities, we analyze 1,140,328 academic writing submissions from 16,791 college students across 2,391 courses between 2021 and 2024 at a public, minority-serving institution in the US. We find that students' overall writing quality gradually increased following the availability of LLMs and that the writing quality gaps between linguistically advantaged and disadvantaged students became increasingly narrow. However, this equitizing effect was more concentrated on students with higher socioeconomic status. These findings shed light on the digital divides of the LLM era, raise questions about the equity benefits of LLMs at this early stage, and highlight the need for researchers and practitioners to develop responsible practices that improve educational equity through LLMs.
Submitted 2 November, 2024; v1 submitted 29 October, 2024; originally announced October 2024.
arXiv:2410.16701 [pdf, other] (cs.LG)
Title: ClimaQA: An Automated Evaluation Framework for Climate Foundation Models
Authors: Veeramakali Vignesh Manivannan, Yasaman Jafari, Srikar Eranky, Spencer Ho, Rose Yu, Duncan Watson-Parris, Yian Ma, Leon Bergen, Taylor Berg-Kirkpatrick
Abstract: The use of foundation models in climate science has recently gained significant attention. However, a critical issue remains: the lack of a comprehensive evaluation framework capable of assessing the quality and scientific validity of model outputs. To address this issue, we develop ClimaGen (Climate QA Generator), an automated algorithmic framework that generates question-answer pairs from graduate textbooks with climate scientists in the loop. As a result, we present ClimaQA-Gold, an expert-annotated benchmark dataset, alongside ClimaQA-Silver, a large-scale, comprehensive synthetic QA dataset for climate science. Finally, we develop evaluation strategies and compare different Large Language Models (LLMs) on our benchmarks. Our results offer novel insights into various approaches used to enhance climate foundation models.
Submitted 22 October, 2024; originally announced October 2024.
arXiv:2410.11761 [pdf, other] (cs.CV, cs.AI)
Title: SlideChat: A Large Vision-Language Assistant for Whole-Slide Pathology Image Understanding
Authors: Ying Chen, Guoan Wang, Yuanfeng Ji, Yanjun Li, Jin Ye, Tianbin Li, Bin Zhang, Nana Pei, Rongshan Yu, Yu Qiao, Junjun He
Abstract: Despite the progress made by multimodal large language models (MLLMs) in computational pathology, they remain limited by a predominant focus on patch-level analysis, missing essential contextual information at the whole-slide level. The lack of large-scale instruction datasets and the gigapixel scale of whole slide images (WSIs) pose significant developmental challenges. In this paper, we present SlideChat, the first vision-language assistant capable of understanding gigapixel whole-slide images, exhibiting excellent multimodal conversational capability and responding to complex instructions across diverse pathology scenarios. To support its development, we created SlideInstruction, the largest instruction-following dataset for WSIs, consisting of 4.2K WSI captions and 176K VQA pairs with multiple categories. Furthermore, we propose SlideBench, a multimodal benchmark that incorporates captioning and VQA tasks to assess SlideChat's capabilities in varied clinical settings such as microscopy and diagnosis. Compared to both general and specialized MLLMs, SlideChat exhibits exceptional capabilities, achieving state-of-the-art performance on 18 of 22 tasks. For example, it achieved an overall accuracy of 81.17% on SlideBench-VQA (TCGA) and 54.15% on SlideBench-VQA (BCNB). We will fully release SlideChat, SlideInstruction and SlideBench as open-source resources to facilitate research and development in computational pathology.
Submitted 24 October, 2024; v1 submitted 15 October, 2024; originally announced October 2024.
arXiv:2410.11474 [pdf, other] (cs.LG, math.OC, stat.ML)
Title: How Transformers Implement Induction Heads: Approximation and Optimization Analysis
Authors: Mingze Wang, Ruoxi Yu, Weinan E, Lei Wu
Abstract: Transformers have demonstrated exceptional in-context learning capabilities, yet the theoretical understanding of the underlying mechanisms remains limited. A recent work (Elhage et al., 2021) identified a "rich" in-context mechanism known as the induction head, contrasting with "lazy" $n$-gram models that overlook long-range dependencies. In this work, we provide both approximation and optimization analyses of how transformers implement induction heads. In the approximation analysis, we formalize both standard and generalized induction head mechanisms, and examine how transformers can efficiently implement them, with an emphasis on the distinct role of each transformer submodule. For the optimization analysis, we study the training dynamics on a synthetic mixed target, composed of a 4-gram and an in-context 2-gram component. This setting enables us to precisely characterize the entire training process and uncover an abrupt transition from lazy (4-gram) to rich (induction head) mechanisms as training progresses.
Submitted 16 October, 2024; v1 submitted 15 October, 2024; originally announced October 2024.
Comments: 39 pages
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11226v1-abstract-full').style.display = 'none'; document.getElementById('2410.11226v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 4 figures. arXiv admin note: text overlap with arXiv:2402.10387</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.07579">arXiv:2410.07579</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.07579">pdf</a>, <a href="https://arxiv.org/format/2410.07579">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Teddy: Efficient Large-Scale Dataset Distillation via Taylor-Approximated Matching </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yu%2C+R">Ruonan Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+S">Songhua Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+J">Jingwen Ye</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xinchao Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.07579v1-abstract-short" style="display: inline;"> Dataset distillation or condensation refers to compressing a large-scale dataset into a much smaller one, enabling models trained on this synthetic dataset to generalize effectively on real data. Tackling this challenge, as defined, relies on a bi-level optimization algorithm: a novel model is trained in each iteration within a nested loop, with gradients propagated through an unrolled computation&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.07579v1-abstract-full').style.display = 'inline'; document.getElementById('2410.07579v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.07579v1-abstract-full" style="display: none;"> Dataset distillation or condensation refers to compressing a large-scale dataset into a much smaller one, enabling models trained on this synthetic dataset to generalize effectively on real data. Tackling this challenge, as defined, relies on a bi-level optimization algorithm: a novel model is trained in each iteration within a nested loop, with gradients propagated through an unrolled computation graph. However, this approach incurs high memory and time complexity, posing difficulties in scaling up to large datasets such as ImageNet. Addressing these concerns, this paper introduces Teddy, a Taylor-approximated dataset distillation framework designed to handle large-scale dataset and enhance efficiency. 
arXiv:2410.07579 [pdf, other] (cs.CV)
Title: Teddy: Efficient Large-Scale Dataset Distillation via Taylor-Approximated Matching
Authors: Ruonan Yu, Songhua Liu, Jingwen Ye, Xinchao Wang
Abstract: Dataset distillation or condensation refers to compressing a large-scale dataset into a much smaller one, enabling models trained on this synthetic dataset to generalize effectively on real data. Tackling this challenge, as defined, relies on a bi-level optimization algorithm: a novel model is trained in each iteration within a nested loop, with gradients propagated through an unrolled computation graph. However, this approach incurs high memory and time complexity, posing difficulties in scaling up to large datasets such as ImageNet. Addressing these concerns, this paper introduces Teddy, a Taylor-approximated dataset distillation framework designed to handle large-scale datasets and enhance efficiency. On the one hand, backed by theoretical analysis, we propose a memory-efficient approximation derived from Taylor expansion, which transforms the original form dependent on multi-step gradients into a first-order one. On the other hand, rather than repeatedly training a novel model in each iteration, we show that employing a pre-cached pool of weak models, which can be generated from a single base model, enhances both time efficiency and performance concurrently, particularly when dealing with large-scale datasets. Extensive experiments demonstrate that the proposed Teddy attains state-of-the-art efficiency and performance on the Tiny-ImageNet and original-sized ImageNet-1K datasets, notably surpassing prior methods by up to 12.8% while reducing runtime by 46.6%. Our code will be available at https://github.com/Lexie-YU/Teddy.
Submitted 9 October, 2024; originally announced October 2024.
Comments: Accepted by ECCV2024
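The "first-order" replacement mentioned in the abstract above rests on the generic first-order Taylor expansion of the loss around the current parameters. As a reminder of that generic identity (not the paper's specific derivation), the loss after one inner gradient step can be approximated using only quantities already available at $\theta$:

$$\mathcal{L}\bigl(\theta - \eta \nabla_\theta \mathcal{L}(\theta)\bigr) \;\approx\; \mathcal{L}(\theta) - \eta \,\bigl\lVert \nabla_\theta \mathcal{L}(\theta) \bigr\rVert^2,$$

which avoids backpropagating through the unrolled inner update.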
arXiv:2410.06431 [pdf, other] (cs.LG)
Title: Functional-level Uncertainty Quantification for Calibrated Fine-tuning on LLMs
Authors: Ruijia Niu, Dongxia Wu, Rose Yu, Yi-An Ma
Abstract: From common-sense reasoning to domain-specific tasks, parameter-efficient fine-tuning (PEFT) methods for large language models (LLMs) have showcased significant performance improvements on downstream tasks. However, fine-tuned LLMs often struggle with overconfidence in uncertain predictions, particularly due to sparse training data. This overconfidence reflects poor epistemic uncertainty calibration, which arises from limitations in the model's ability to generalize with limited data. Existing PEFT uncertainty quantification methods for LLMs focus on the post fine-tuning stage and thus have limited capability in calibrating epistemic uncertainty. To address these limitations, we propose Functional-Level Uncertainty Quantification for Calibrated Fine-Tuning (UQ4CT), which captures and calibrates functional-level epistemic uncertainty during the fine-tuning stage via a mixture-of-expert framework. We show that UQ4CT reduces Expected Calibration Error (ECE) by more than $25\%$ while maintaining high accuracy across $5$ benchmarks. Furthermore, UQ4CT maintains superior ECE performance with high accuracy under distribution shift, showcasing improved generalizability.
Submitted 8 October, 2024; originally announced October 2024.
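The headline metric in the abstract above, Expected Calibration Error (ECE), has a standard binned definition that is easy to compute. The sketch below uses that common definition, which may differ in details (binning scheme, number of bins) from the variant used in the paper.

    # Standard binned Expected Calibration Error; a common definition, not
    # necessarily the exact variant used in the paper above.
    import numpy as np

    def expected_calibration_error(confidences, correct, n_bins=10):
        """ECE = sum_b (|B_b|/N) * |accuracy(B_b) - mean confidence(B_b)|."""
        confidences = np.asarray(confidences, dtype=float)
        correct = np.asarray(correct, dtype=float)
        edges = np.linspace(0.0, 1.0, n_bins + 1)
        ece = 0.0
        for lo, hi in zip(edges[:-1], edges[1:]):
            in_bin = (confidences > lo) & (confidences <= hi)
            if in_bin.any():
                gap = abs(correct[in_bin].mean() - confidences[in_bin].mean())
                ece += in_bin.mean() * gap
        return ece

    print(expected_calibration_error([0.9, 0.8, 0.6, 0.55], [1, 1, 0, 1]))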
arXiv:2410.05500 [pdf, other] (cs.CV, cs.AI, cs.LG)
Title: Residual Kolmogorov-Arnold Network for Enhanced Deep Learning
Authors: Ray Congrui Yu, Sherry Wu, Jiang Gui
Abstract: Despite their strong performance in many computer vision tasks, Convolutional Neural Networks (CNNs) can sometimes struggle to efficiently capture long-range, complex non-linear dependencies in deeper layers of the network. We address this limitation by introducing Residual KAN, which incorporates the Kolmogorov-Arnold Network (KAN) within the CNN framework as a residual component. Our approach uses Chebyshev polynomials as the basis for KAN convolutions, enabling more expressive and adaptive feature representations while maintaining computational efficiency. The proposed RKAN blocks, when integrated into established architectures such as ResNet and DenseNet, offer consistent improvements over the baseline models on various well-known benchmarks. Our results demonstrate the potential of RKAN to enhance the capabilities of deep CNNs on visual data.
Submitted 7 October, 2024; originally announced October 2024.
Comments: Code is available at https://github.com/withray/residualKAN.git
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05440v2-abstract-full').style.display = 'none'; document.getElementById('2410.05440v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.04005">arXiv:2410.04005</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.04005">pdf</a>, <a href="https://arxiv.org/format/2410.04005">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> Enhancing the Travel Experience for People with Visual Impairments through Multimodal Interaction: NaviGPT, A Real-Time AI-Driven Mobile Navigation System </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+H">He Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Falletta%2C+N+J">Nicholas J. Falletta</a>, <a href="/search/cs?searchtype=author&amp;query=Xie%2C+J">Jingyi Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+R">Rui Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+S">Sooyeon Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Billah%2C+S+M">Syed Masum Billah</a>, <a href="/search/cs?searchtype=author&amp;query=Carroll%2C+J+M">John M. Carroll</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.04005v1-abstract-short" style="display: inline;"> Assistive technologies for people with visual impairments (PVI) have made significant advancements, particularly with the integration of artificial intelligence (AI) and real-time sensor technologies. However, current solutions often require PVI to switch between multiple apps and tools for tasks like image recognition, navigation, and obstacle detection, which can hinder a seamless and efficient&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.04005v1-abstract-full').style.display = 'inline'; document.getElementById('2410.04005v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.04005v1-abstract-full" style="display: none;"> Assistive technologies for people with visual impairments (PVI) have made significant advancements, particularly with the integration of artificial intelligence (AI) and real-time sensor technologies. However, current solutions often require PVI to switch between multiple apps and tools for tasks like image recognition, navigation, and obstacle detection, which can hinder a seamless and efficient user experience. In this paper, we present NaviGPT, a high-fidelity prototype that integrates LiDAR-based obstacle detection, vibration feedback, and large language model (LLM) responses to provide a comprehensive and real-time navigation aid for PVI. 
arXiv:2410.04005 [pdf, other] (cs.HC)
Title: Enhancing the Travel Experience for People with Visual Impairments through Multimodal Interaction: NaviGPT, A Real-Time AI-Driven Mobile Navigation System
Authors: He Zhang, Nicholas J. Falletta, Jingyi Xie, Rui Yu, Sooyeon Lee, Syed Masum Billah, John M. Carroll
Abstract: Assistive technologies for people with visual impairments (PVI) have made significant advancements, particularly with the integration of artificial intelligence (AI) and real-time sensor technologies. However, current solutions often require PVI to switch between multiple apps and tools for tasks like image recognition, navigation, and obstacle detection, which can hinder a seamless and efficient user experience. In this paper, we present NaviGPT, a high-fidelity prototype that integrates LiDAR-based obstacle detection, vibration feedback, and large language model (LLM) responses to provide a comprehensive and real-time navigation aid for PVI. Unlike existing applications such as Be My AI and Seeing AI, NaviGPT combines image recognition and contextual navigation guidance into a single system, offering continuous feedback on the user's surroundings without the need for app-switching. Meanwhile, NaviGPT compensates for the response delays of the LLM by using location and sensor data, aiming to provide practical and efficient navigation support for PVI in dynamic environments.
Submitted 4 October, 2024; originally announced October 2024.
Comments: 7 pages, 3 figures; this work has been accepted by the 2025 ACM International Conference on Supporting Group Work (GROUP '25)
arXiv:2410.01011 [pdf, other] (cs.LG)
Title: Back to Bayesics: Uncovering Human Mobility Distributions and Anomalies with an Integrated Statistical and Neural Framework
Authors: Minxuan Duan, Yinlong Qian, Lingyi Zhao, Zihao Zhou, Zeeshan Rasheed, Rose Yu, Khurram Shafique
Abstract: Existing methods for anomaly detection often fall short due to their inability to handle the complexity, heterogeneity, and high dimensionality inherent in real-world mobility data. In this paper, we propose DeepBayesic, a novel framework that integrates Bayesian principles with deep neural networks to model the underlying multivariate distributions from sparse and complex datasets. Unlike traditional models, DeepBayesic is designed to manage heterogeneous inputs, accommodating both continuous and categorical data to provide a more comprehensive understanding of mobility patterns. The framework features customized neural density estimators and hybrid architectures, allowing for flexibility in modeling diverse feature distributions and enabling the use of specialized neural networks tailored to different data types. Our approach also leverages agent embeddings for personalized anomaly detection, enhancing its ability to distinguish between normal and anomalous behaviors for individual agents. We evaluate our approach on several mobility datasets, demonstrating significant improvements over state-of-the-art anomaly detection methods. Our results indicate that incorporating personalization and advanced sequence modeling techniques can substantially enhance the ability to detect subtle and complex anomalies in spatiotemporal event sequences.
Submitted 3 October, 2024; v1 submitted 1 October, 2024; originally announced October 2024.
Comments: 12 pages
arXiv:2409.19550 [pdf, ps, other] (cs.LG)
Title: Tailed Low-Rank Matrix Factorization for Similarity Matrix Completion
Authors: Changyi Ma, Runsheng Yu, Xiao Chen, Youzhi Zhang
Abstract: The similarity matrix serves as a fundamental tool at the core of numerous downstream machine-learning tasks. However, missing data is inevitable and often results in an inaccurate similarity matrix. To address this issue, Similarity Matrix Completion (SMC) methods have been proposed, but they suffer from high computational complexity due to the Singular Value Decomposition (SVD) operation. To reduce the computational complexity, Matrix Factorization (MF) techniques are more explicit and frequently applied to provide a low-rank solution, but the exact low-rank optimal solution cannot be guaranteed since the problem has a non-convex structure. In this paper, we introduce a novel SMC framework that offers a more reliable and efficient solution. Specifically, beyond simply utilizing the unique Positive Semi-definiteness (PSD) property to guide the completion process, our approach further complements a carefully designed rank-minimization regularizer, aiming to achieve an optimal and low-rank solution. Based on the key insight that the underlying PSD property and low-rank property improve SMC performance, we present two novel, scalable, and effective algorithms, SMCNN and SMCNmF, which exploit the PSD property to guide the estimation process and incorporate a nonconvex low-rank regularizer to ensure the low-rank solution. Theoretical analysis ensures better estimation performance and convergence speed. Empirical results on real-world datasets demonstrate the superiority and efficiency of our proposed methods compared to various baseline methods.
Submitted 29 September, 2024; originally announced September 2024.
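A textbook way to exploit the PSD structure emphasized in the abstract above is to parameterize the completed matrix as $M = UU^{\top}$ (PSD by construction) and fit $U$ on the observed entries by gradient descent. The sketch below shows that generic baseline only; it is not the SMCNN or SMCNmF algorithm and omits their rank-minimization regularizer.

    # Generic PSD low-rank completion via M = U U^T on observed entries only.
    # A baseline illustration, not the SMCNN/SMCNmF algorithms.
    import numpy as np

    def psd_lowrank_complete(S_obs, mask, rank=5, lr=0.01, steps=2000, seed=0):
        """S_obs: observed similarities (zeros where missing); mask: 1 if observed."""
        rng = np.random.default_rng(seed)
        n = S_obs.shape[0]
        U = 0.1 * rng.standard_normal((n, rank))
        for _ in range(steps):
            R = mask * (U @ U.T - S_obs)          # residual on observed entries
            grad = 2.0 * (R + R.T) @ U            # gradient of ||mask*(UU^T - S)||_F^2
            U -= lr * grad / n
        return U @ U.T                            # completed matrix, PSD by construction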
arXiv:2409.17143 [pdf, other] (cs.CV, cs.AI)
Title: Attention Prompting on Image for Large Vision-Language Models
Authors: Runpeng Yu, Weihao Yu, Xinchao Wang
Abstract: Compared with Large Language Models (LLMs), Large Vision-Language Models (LVLMs) can also accept images as input, thus showcasing more interesting emergent capabilities and demonstrating impressive performance on various vision-language tasks. Motivated by text prompting in LLMs, visual prompting has been explored to enhance LVLMs' capabilities of perceiving visual information. However, previous visual prompting techniques solely process visual inputs without considering text queries, limiting the models' ability to follow text instructions to complete tasks. To fill this gap, we propose a new prompting technique named Attention Prompting on Image, which simply overlays a text-query-guided attention heatmap on the original input image and effectively enhances the LVLM on various tasks. Specifically, we generate an attention heatmap for the input image dependent on the text query with an auxiliary model like CLIP. The heatmap is then multiplied with the pixel values of the original image to obtain the actual input image for the LVLM. Extensive experiments on various vision-language benchmarks verify the effectiveness of our technique. For example, Attention Prompting on Image improves LLaVA-1.5 by 3.8% and 2.9% on the MM-Vet and LLaVA-Wild benchmarks, respectively.
Submitted 25 September, 2024; originally announced September 2024.
Comments: Website, see https://yu-rp.github.io/api-prompting
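The weighting step described in the abstract above reduces to an element-wise multiplication between the image and a normalized, query-conditioned heatmap. The sketch below illustrates that step with a placeholder heatmap; in the paper the heatmap comes from an auxiliary model such as CLIP, and the normalization and floor used here are assumptions, not the released implementation.

    # Heatmap-weighting sketch; the heatmap is a stand-in for a CLIP-derived
    # relevance map. Not the authors' released code.
    import numpy as np

    def apply_attention_prompt(image, heatmap, floor=0.3):
        """Scale image pixels by a normalized query-conditioned heatmap."""
        h = (heatmap - heatmap.min()) / (heatmap.max() - heatmap.min() + 1e-8)
        h = floor + (1.0 - floor) * h                 # avoid zeroing regions entirely
        return (image * h[..., None]).astype(image.dtype)

    image = np.random.randint(0, 256, (224, 224, 3), dtype=np.uint8)
    heatmap = np.random.rand(224, 224)                # placeholder relevance map
    prompted = apply_attention_prompt(image, heatmap)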
arXiv:2409.15692 [pdf, other] (cs.RO)
Title: Walking with Terrain Reconstruction: Learning to Traverse Risky Sparse Footholds
Authors: Ruiqi Yu, Qianshi Wang, Yizhen Wang, Zhicheng Wang, Jun Wu, Qiuguo Zhu
Abstract: Traversing risky terrains with sparse footholds presents significant challenges for legged robots, requiring precise foot placement in safe areas. Current learning-based methods often rely on implicit feature representations without supervising physically significant estimation targets. This limits the policy's ability to fully understand complex terrain structures, which is critical for generating accurate actions. In this paper, we use end-to-end reinforcement learning to traverse risky terrains with high sparsity and randomness. Our approach integrates proprioception with single-view depth images to reconstruct the robot's local terrain, enabling a more comprehensive representation of terrain information. Meanwhile, by incorporating implicit and explicit estimations of the robot's state and its surroundings, we improve the policy's environmental understanding, leading to more precise actions. We deploy the proposed framework on a low-cost quadrupedal robot, achieving agile and adaptive locomotion across various challenging terrains and demonstrating outstanding performance in real-world scenarios. Video at: http://youtu.be/ReQAR4D6tuc.
Submitted 23 September, 2024; originally announced September 2024.
arXiv:2409.11018 [pdf, other] (cs.CV)
Title: Unleashing the Potential of Mamba: Boosting a LiDAR 3D Sparse Detector by Using Cross-Model Knowledge Distillation
Authors: Rui Yu, Runkai Zhao, Jiagen Li, Qingsong Zhao, Songhao Zhu, HuaiCheng Yan, Meng Wang
Abstract: A LiDAR-based 3D object detector that strikes a balance between accuracy and speed is crucial for achieving real-time perception in autonomous driving and robotic navigation systems. To enhance the accuracy of point cloud detection, integrating global context for visual understanding improves the point cloud's ability to grasp overall spatial information. However, many existing LiDAR detection models depend on intricate feature transformation and extraction processes, leading to poor real-time performance and high resource consumption, which limits their practical effectiveness. In this work, we propose a faster LiDAR 3D object detection framework, called FASD, which implements heterogeneous model distillation by adaptively unifying cross-model voxel features. We aim to distill the transformer's capacity for high-performance sequence modeling into Mamba models with low FLOPs, achieving a significant improvement in accuracy through knowledge transfer. Specifically, Dynamic Voxel Group and Adaptive Attention strategies are integrated into the sparse backbone, creating a robust teacher model with scale-adaptive attention for effective global visual context modeling. Following feature alignment with the Adapter, we transfer knowledge from the Transformer to the Mamba through latent space feature supervision and span-head distillation, resulting in improved performance and an efficient student model. We evaluated the framework on the Waymo and nuScenes datasets, achieving a 4x reduction in resource consumption and a 1-2% performance improvement over the current SoTA methods.
Submitted 17 September, 2024; originally announced September 2024.
The Tabu FO-CS algorithm can schedule 94.45% of flows at the level of 2000 flows. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.09585v1-abstract-full').style.display = 'none'; document.getElementById('2409.09585v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.06277">arXiv:2409.06277</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.06277">pdf</a>, <a href="https://arxiv.org/format/2409.06277">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Ferret: Federated Full-Parameter Tuning at Scale for Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Shu%2C+Y">Yao Shu</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+W">Wenyang Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Ng%2C+S">See-Kiong Ng</a>, <a href="/search/cs?searchtype=author&amp;query=Low%2C+B+K+H">Bryan Kian Hsiang Low</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+F+R">Fei Richard Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.06277v2-abstract-short" style="display: inline;"> Large Language Models (LLMs) have become indispensable in numerous real-world applications. Unfortunately, fine-tuning these models at scale, especially in federated settings where data privacy and communication efficiency are critical, presents significant challenges. Existing methods often resort to parameter-efficient fine-tuning (PEFT) to mitigate communication overhead, but this typically com&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.06277v2-abstract-full').style.display = 'inline'; document.getElementById('2409.06277v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.06277v2-abstract-full" style="display: none;"> Large Language Models (LLMs) have become indispensable in numerous real-world applications. Unfortunately, fine-tuning these models at scale, especially in federated settings where data privacy and communication efficiency are critical, presents significant challenges. Existing methods often resort to parameter-efficient fine-tuning (PEFT) to mitigate communication overhead, but this typically comes at the cost of model accuracy. To address these limitations, we propose federated full-parameter tuning at scale for LLMs (Ferret), the first first-order method with shared randomness to enable scalable full-parameter tuning of LLMs across decentralized data sources while maintaining competitive model accuracy. 
Ferret accomplishes this through three aspects: (1) it employs widely applied first-order methods for efficient local updates; (2) it projects these updates into a low-dimensional space to considerably reduce communication overhead; and (3) it reconstructs local updates from this low-dimensional space with shared randomness to facilitate effective full-parameter global aggregation, ensuring fast convergence and competitive final performance. Our rigorous theoretical analyses and insights along with extensive experiments, show that Ferret significantly enhances the scalability of existing federated full-parameter tuning approaches by achieving high computational efficiency, reduced communication overhead, and fast convergence, all while maintaining competitive model accuracy. Our implementation is available at https://github.com/allen4747/Ferret. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.06277v2-abstract-full').style.display = 'none'; document.getElementById('2409.06277v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.04390">arXiv:2409.04390</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.04390">pdf</a>, <a href="https://arxiv.org/format/2409.04390">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Future Does Matter: Boosting 3D Object Detection with Temporal Motion Estimation in Point Cloud Sequences </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yu%2C+R">Rui Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+R">Runkai Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Nie%2C+C">Cong Nie</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+H">Heng Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+H">HuaiCheng Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+M">Meng Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.04390v1-abstract-short" style="display: inline;"> Accurate and robust LiDAR 3D object detection is essential for comprehensive scene understanding in autonomous driving. Despite its importance, LiDAR detection performance is limited by inherent constraints of point cloud data, particularly under conditions of extended distances and occlusions. 
Recently, temporal aggregation has been proven to significantly enhance detection accuracy by fusing mul&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.04390v1-abstract-full').style.display = 'inline'; document.getElementById('2409.04390v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.04390v1-abstract-full" style="display: none;"> Accurate and robust LiDAR 3D object detection is essential for comprehensive scene understanding in autonomous driving. Despite its importance, LiDAR detection performance is limited by inherent constraints of point cloud data, particularly under conditions of extended distances and occlusions. Recently, temporal aggregation has been proven to significantly enhance detection accuracy by fusing multi-frame viewpoint information and enriching the spatial representation of objects. In this work, we introduce a novel LiDAR 3D object detection framework, namely LiSTM, to facilitate spatial-temporal feature learning with cross-frame motion forecasting information. We aim to improve the spatial-temporal interpretation capabilities of the LiDAR detector by incorporating a dynamic prior, generated from a non-learnable motion estimation model. Specifically, Motion-Guided Feature Aggregation (MGFA) is proposed to utilize the object trajectory from previous and future motion states to model spatial-temporal correlations into gaussian heatmap over a driving sequence. This motion-based heatmap then guides the temporal feature fusion, enriching the proposed object features. Moreover, we design a Dual Correlation Weighting Module (DCWM) that effectively facilitates the interaction between past and prospective frames through scene- and channel-wise feature abstraction. In the end, a cascade cross-attention-based decoder is employed to refine the 3D prediction. We have conducted experiments on the Waymo and nuScenes datasets to demonstrate that the proposed framework achieves superior 3D detection performance with effective spatial-temporal feature learning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.04390v1-abstract-full').style.display = 'none'; document.getElementById('2409.04390v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
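<p class="is-size-7">To make the motion-based heatmap idea in the LiSTM abstract concrete, here is a small numpy sketch that splats past and forecast object centers onto a bird's-eye-view grid as Gaussians; the grid size, cell resolution, and sigma are placeholder assumptions rather than values from the paper.</p>
<pre><code>
# Render trajectory centers as a Gaussian heatmap on a BEV grid, a simplified
# stand-in for the motion-guided prior described above. All numbers are illustrative.
import numpy as np

def motion_heatmap(centers_xy, grid=200, cell=0.5, sigma=2.0):
    """centers_xy: (N, 2) object centers in metres, with the ego vehicle at the grid centre."""
    heat = np.zeros((grid, grid), dtype=np.float32)
    ys, xs = np.mgrid[0:grid, 0:grid]
    for cx, cy in centers_xy:
        gx, gy = cx / cell + grid / 2, cy / cell + grid / 2   # metres to grid indices
        heat += np.exp(-((xs - gx) ** 2 + (ys - gy) ** 2) / (2 * sigma ** 2))
    return np.clip(heat, 0.0, 1.0)

# centers gathered from previous and forecast frames of one object trajectory
trajectory = np.array([[4.0, 10.0], [4.5, 12.0], [5.0, 14.0]])
prior = motion_heatmap(trajectory)
print(prior.shape, prior.max())   # (200, 200) 1.0
</code></pre>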
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.01581">arXiv:2409.01581</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.01581">pdf</a>, <a href="https://arxiv.org/format/2409.01581">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> GaussianPU: A Hybrid 2D-3D Upsampling Framework for Enhancing Color Point Clouds via 3D Gaussian Splatting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Guo%2C+Z">Zixuan Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Xie%2C+Y">Yifan Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Xie%2C+W">Weijing Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+P">Peng Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+F">Fei Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+F+R">Fei Richard Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.01581v1-abstract-short" style="display: inline;"> Dense colored point clouds enhance visual perception and are of significant value in various robotic applications. However, existing learning-based point cloud upsampling methods are constrained by computational resources and batch processing strategies, which often require subdividing point clouds into smaller patches, leading to distortions that degrade perceptual quality. To address this challe&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.01581v1-abstract-full').style.display = 'inline'; document.getElementById('2409.01581v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.01581v1-abstract-full" style="display: none;"> Dense colored point clouds enhance visual perception and are of significant value in various robotic applications. However, existing learning-based point cloud upsampling methods are constrained by computational resources and batch processing strategies, which often require subdividing point clouds into smaller patches, leading to distortions that degrade perceptual quality. To address this challenge, we propose a novel 2D-3D hybrid colored point cloud upsampling framework (GaussianPU) based on 3D Gaussian Splatting (3DGS) for robotic perception. This approach leverages 3DGS to bridge 3D point clouds with their 2D rendered images in robot vision systems. A dual scale rendered image restoration network transforms sparse point cloud renderings into dense representations, which are then input into 3DGS along with precise robot camera poses and interpolated sparse point clouds to reconstruct dense 3D point clouds. We have made a series of enhancements to the vanilla 3DGS, enabling precise control over the number of points and significantly boosting the quality of the upsampled point cloud for robotic scene understanding. 
Our framework supports processing entire point clouds on a single consumer-grade GPU, such as the NVIDIA GeForce RTX 3090, eliminating the need for segmentation and thus producing high-quality, dense colored point clouds with millions of points for robot navigation and manipulation tasks. Extensive experimental results on generating million-level point cloud data validate the effectiveness of our method, substantially improving the quality of colored point clouds and demonstrating significant potential for applications involving large-scale point clouds in autonomous robotics and human-robot interaction scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.01581v1-abstract-full').style.display = 'none'; document.getElementById('2409.01581v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7 pages, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.00897">arXiv:2409.00897</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.00897">pdf</a>, <a href="https://arxiv.org/format/2409.00897">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Emerging Technologies">cs.ET</span> </div> </div> <p class="title is-5 mathjax"> Infiltrating the Sky: Data Delay and Overflow Attacks in Earth Observation Constellations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xiaojian Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+R">Ruozhou Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+D">Dejun Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Xue%2C+G">Guoliang Xue</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.00897v2-abstract-short" style="display: inline;"> Low Earth Orbit (LEO) Earth Observation (EO) satellites have changed the way we monitor Earth. Acting like moving cameras, EO satellites are formed in constellations with different missions and priorities, and capture vast data that needs to be transmitted to the ground for processing. However, EO satellites have very limited downlink communication capability, limited by transmission bandwidth, nu&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.00897v2-abstract-full').style.display = 'inline'; document.getElementById('2409.00897v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.00897v2-abstract-full" style="display: none;"> Low Earth Orbit (LEO) Earth Observation (EO) satellites have changed the way we monitor Earth. 
Acting like moving cameras, EO satellites are formed in constellations with different missions and priorities, and capture vast data that needs to be transmitted to the ground for processing. However, EO satellites have very limited downlink communication capability, limited by transmission bandwidth, number and location of ground stations, and small transmission windows due to high velocity satellite movement. To optimize resource utilization, EO constellations are expected to share communication spectrum and ground stations for maximum communication efficiency. In this paper, we investigate a new attack surface exposed by resource competition in EO constellations, targeting the delay or drop of Earth monitoring data using legitimate EO services. Specifically, an attacker can inject high-priority requests to temporarily preempt low-priority data transmission windows. Furthermore, we show that by utilizing predictable satellite dynamics, an attacker can intelligently target critical data from low-priority satellites, either delaying its delivery or irreversibly dropping the data. We formulate two attacks, the data delay attack and the data overflow attack, design algorithms to assist attackers in devising attack strategies, and analyze their feasibility or optimality in typical scenarios. We then conduct trace-driven simulations using real-world satellite images and orbit data to evaluate the success probability of launching these attacks under realistic satellite communication settings. We also discuss possible defenses against these attacks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.00897v2-abstract-full').style.display = 'none'; document.getElementById('2409.00897v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
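<p class="is-size-7">The attack surface described in the abstract above comes down to priority-based contention for scarce downlink windows. The toy scheduler below is purely illustrative (the one-request-per-window model and all values are assumptions): it shows how an injected high-priority request pushes legitimate low-priority data out of its transmission windows.</p>
<pre><code>
# Toy single-ground-station downlink: one request served per contact window,
# highest priority first. Demonstrates delay of low-priority data by an
# injected high-priority request; every number here is made up.
import heapq

def schedule(requests, num_windows):
    """requests: list of (priority, name); a lower number means higher priority."""
    heap = list(requests)
    heapq.heapify(heap)
    served = []
    for window in range(num_windows):
        if heap:
            _, name = heapq.heappop(heap)
            served.append((window, name))
    return served, [name for _, name in heap]    # delivered vs. still waiting

legit = [(2, "victim-sat image A"), (2, "victim-sat image B")]
print(schedule(legit, 2))                   # both images delivered within two windows
attack = legit + [(1, "attacker request")]  # injected higher-priority task
print(schedule(attack, 2))                  # image B is left waiting past its windows
</code></pre>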
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.16982">arXiv:2408.16982</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.16982">pdf</a>, <a href="https://arxiv.org/format/2408.16982">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> 2DGH: 2D Gaussian-Hermite Splatting for High-quality Rendering and Better Geometry Reconstruction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yu%2C+R">Ruihan Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+T">Tianyu Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Ling%2C+J">Jingwang Ling</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+F">Feng Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.16982v1-abstract-short" style="display: inline;"> 2D Gaussian Splatting has recently emerged as a significant method in 3D reconstruction, enabling novel view synthesis and geometry reconstruction simultaneously. While the well-known Gaussian kernel is broadly used, its lack of anisotropy and deformation ability leads to dim and vague edges at object silhouettes, limiting the reconstruction quality of current Gaussian splatting methods. To enhanc&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.16982v1-abstract-full').style.display = 'inline'; document.getElementById('2408.16982v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.16982v1-abstract-full" style="display: none;"> 2D Gaussian Splatting has recently emerged as a significant method in 3D reconstruction, enabling novel view synthesis and geometry reconstruction simultaneously. While the well-known Gaussian kernel is broadly used, its lack of anisotropy and deformation ability leads to dim and vague edges at object silhouettes, limiting the reconstruction quality of current Gaussian splatting methods. To enhance the representation power, we draw inspiration from quantum physics and propose to use the Gaussian-Hermite kernel as the new primitive in Gaussian splatting. The new kernel takes a unified mathematical form and extends the Gaussian function, which serves as the zero-rank term in the updated formulation. Our experiments demonstrate the extraordinary performance of Gaussian-Hermite kernel in both geometry reconstruction and novel-view synthesis tasks. The proposed kernel outperforms traditional Gaussian Splatting kernels, showcasing its potential for high-quality 3D reconstruction and rendering. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.16982v1-abstract-full').style.display = 'none'; document.getElementById('2408.16982v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. 
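<p class="is-size-7">For readers unfamiliar with Gaussian-Hermite functions, the snippet below evaluates a separable 2D Gaussian-Hermite basis (a Hermite polynomial times a Gaussian envelope) with numpy; the order-(0, 0) case reduces to the plain Gaussian kernel, which is the sense in which the Gaussian serves as the zero-rank term. The normalization and orders shown are illustrative, not the paper's exact parameterization.</p>
<pre><code>
# Gaussian-Hermite basis: H_n(x) * exp(-x^2 / 2), extended to 2D as a product of
# 1D factors. Order (0, 0) recovers the ordinary Gaussian kernel.
import numpy as np
from numpy.polynomial.hermite import hermval

def gauss_hermite_1d(x, order):
    coeffs = np.zeros(order + 1)
    coeffs[order] = 1.0                          # select the physicists' Hermite H_order
    return hermval(x, coeffs) * np.exp(-0.5 * x ** 2)

def gauss_hermite_2d(x, y, order_x=0, order_y=0):
    return gauss_hermite_1d(x, order_x) * gauss_hermite_1d(y, order_y)

xs = np.linspace(-3, 3, 5)
print(gauss_hermite_2d(xs, xs, 0, 0))            # plain Gaussian samples
print(gauss_hermite_2d(xs, xs, 2, 0))            # higher-order kernel with sign changes
</code></pre>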
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.15516">arXiv:2408.15516</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.15516">pdf</a>, <a href="https://arxiv.org/format/2408.15516">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> </div> </div> <p class="title is-5 mathjax"> Predicting Parameter Change&#39;s Effect on Cellular Network Time Series </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+M">Mingjie Li</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+Y">Yongqian Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Hua%2C+X">Xiaolei Hua</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+R">Renkai Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Fan%2C+X">Xinwen Fan</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+L">Lin Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Feng%2C+J">Junlan Feng</a>, <a href="/search/cs?searchtype=author&amp;query=Pei%2C+D">Dan Pei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.15516v1-abstract-short" style="display: inline;"> The cellular network provides convenient network access for ever-growing mobile phones. During the continuous optimization, operators can adjust cell parameters to enhance the Quality of Service (QoS) flexibly. A precise prediction of the parameter change&#39;s effect can help operators make proper parameter adjustments. This work focuses on predicting cell status (like the workload and QoS) after adj&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.15516v1-abstract-full').style.display = 'inline'; document.getElementById('2408.15516v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.15516v1-abstract-full" style="display: none;"> The cellular network provides convenient network access for ever-growing mobile phones. During the continuous optimization, operators can adjust cell parameters to enhance the Quality of Service (QoS) flexibly. A precise prediction of the parameter change&#39;s effect can help operators make proper parameter adjustments. This work focuses on predicting cell status (like the workload and QoS) after adjusting the cell parameters. The prediction will be conducted before an adjustment is actually applied to provide an early inspection. As it can be hard for available parameter adjustments with a limited number to cover all the parameter and user behavior combinations, we propose ParaSeer fusing domain knowledge on parameter adjustments into data-driven time series forecasting. ParaSeer organizes several pre-trained Transformers for adjustment-free time series forecasting, utilizing plenty of adjustment-free data. On the other hand, ParaSeer models the effect of adjusting the transmission power and cell individual offset (CIO) as a multiplier for the workload. We derive a formula to calculate the multiplier from the underlying mechanism of those two parameters, helping ParaSeer eliminate the thirst for data with parameter adjustments. 
We compare ParaSeer with baselines on two real-world datasets, where ParaSeer outperforms the best baseline by more than 25.8% in terms of RMSE. The extensive experiments further illustrate the contributions of ParaSeer&#39;s components. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.15516v1-abstract-full').style.display = 'none'; document.getElementById('2408.15516v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.15270">arXiv:2408.15270</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.15270">pdf</a>, <a href="https://arxiv.org/format/2408.15270">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> SkillMimic: Learning Reusable Basketball Skills from Demonstrations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yinhuai Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+Q">Qihan Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+R">Runyi Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Zeng%2C+A">Ailing Zeng</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+J">Jing Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+Z">Zhengyi Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Tsui%2C+H+W">Hok Wai Tsui</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+J">Jiwen Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xiu Li</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Q">Qifeng Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+J">Jian Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+L">Lei Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Tan%2C+P">Ping Tan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.15270v1-abstract-short" style="display: inline;"> Mastering basketball skills such as diverse layups and dribbling involves complex interactions with the ball and requires real-time adjustments. Traditional reinforcement learning methods for interaction skills rely on labor-intensive, manually designed rewards that do not generalize well across different skills. 
Inspired by how humans learn from demonstrations, we propose SkillMimic, a data-drive&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.15270v1-abstract-full').style.display = 'inline'; document.getElementById('2408.15270v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.15270v1-abstract-full" style="display: none;"> Mastering basketball skills such as diverse layups and dribbling involves complex interactions with the ball and requires real-time adjustments. Traditional reinforcement learning methods for interaction skills rely on labor-intensive, manually designed rewards that do not generalize well across different skills. Inspired by how humans learn from demonstrations, we propose SkillMimic, a data-driven approach that mimics both human and ball motions to learn a wide variety of basketball skills. SkillMimic employs a unified configuration to learn diverse skills from human-ball motion datasets, with skill diversity and generalization improving as the dataset grows. This approach allows training a single policy to learn multiple skills, enabling smooth skill switching even if these switches are not present in the reference dataset. The skills acquired by SkillMimic can be easily reused by a high-level controller to accomplish complex basketball tasks. To evaluate our approach, we introduce two basketball datasets: one estimated through monocular RGB videos and the other using advanced motion capture equipment, collectively containing about 35 minutes of diverse basketball skills. Experiments show that our method can effectively learn various basketball skills included in the dataset with a unified configuration, including various styles of dribbling, layups, and shooting. Furthermore, by training a high-level controller to reuse the acquired skills, we can achieve complex basketball tasks such as layup scoring, which involves dribbling toward the basket, timing the dribble and layup to score, retrieving the rebound, and repeating the process. The project page and video demonstrations are available at https://ingrid789.github.io/SkillMimic/ <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.15270v1-abstract-full').style.display = 'none'; document.getElementById('2408.15270v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. 
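<p class="is-size-7">As a purely illustrative gloss on the data-driven imitation objective sketched in the SkillMimic abstract (not the paper's reward), the snippet below scores a simulated state against one reference human-and-ball motion frame using exponentials of tracking errors; the joint count, weights, and error terms are assumptions.</p>
<pre><code>
# Illustrative imitation-style reward: compare simulated joint positions and the
# ball position against a reference motion frame. Weights and scales are made up.
import numpy as np

def imitation_reward(sim_joints, sim_ball, ref_joints, ref_ball, w_body=0.6, w_ball=0.4):
    body_err = np.mean(np.sum((sim_joints - ref_joints) ** 2, axis=-1))   # mean per-joint squared error
    ball_err = np.sum((sim_ball - ref_ball) ** 2)
    return w_body * np.exp(-2.0 * body_err) + w_ball * np.exp(-2.0 * ball_err)

ref_joints = np.zeros((24, 3))             # one reference skeleton frame (24 joints)
ref_ball = np.array([0.3, 0.0, 1.0])       # reference ball position
sim_joints = ref_joints + 0.01 * np.random.randn(24, 3)
print(imitation_reward(sim_joints, ref_ball + 0.02, ref_joints, ref_ball))   # close to 1.0 when tracking is tight
</code></pre>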
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.14997">arXiv:2408.14997</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.14997">pdf</a>, <a href="https://arxiv.org/format/2408.14997">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Depth Restoration of Hand-Held Transparent Objects for Human-to-Robot Handover </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yu%2C+R">Ran Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+H">Haixin Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+S">Shoujie Li</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+H">Huang Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+Z">Ziwu Song</a>, <a href="/search/cs?searchtype=author&amp;query=Ding%2C+W">Wenbo Ding</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.14997v2-abstract-short" style="display: inline;"> Transparent objects are common in daily life, while their optical properties pose challenges for RGB-D cameras to capture accurate depth information. This issue is further amplified when these objects are hand-held, as hand occlusions further complicate depth estimation. For assistant robots, however, accurately perceiving hand-held transparent objects is critical to effective human-robot interact&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.14997v2-abstract-full').style.display = 'inline'; document.getElementById('2408.14997v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.14997v2-abstract-full" style="display: none;"> Transparent objects are common in daily life, while their optical properties pose challenges for RGB-D cameras to capture accurate depth information. This issue is further amplified when these objects are hand-held, as hand occlusions further complicate depth estimation. For assistant robots, however, accurately perceiving hand-held transparent objects is critical to effective human-robot interaction. This paper presents a Hand-Aware Depth Restoration (HADR) method based on creating an implicit neural representation function from a single RGB-D image. The proposed method utilizes hand posture as an important guidance to leverage semantic and geometric information of hand-object interaction. To train and evaluate the proposed method, we create a high-fidelity synthetic dataset named TransHand-14K with a real-to-sim data generation scheme. Experiments show that our method has better performance and generalization ability compared with existing methods. We further develop a real-world human-to-robot handover system based on HADR, demonstrating its potential in human-robot interaction applications. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.14997v2-abstract-full').style.display = 'none'; document.getElementById('2408.14997v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7 pages, 7 figures, conference</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.13740">arXiv:2408.13740</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.13740">pdf</a>, <a href="https://arxiv.org/format/2408.13740">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> PIE: Parkour with Implicit-Explicit Learning Framework for Legged Robots </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Luo%2C+S">Shixin Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+S">Songbo Li</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+R">Ruiqi Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zhicheng Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+J">Jun Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+Q">Qiuguo Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.13740v3-abstract-short" style="display: inline;"> Parkour presents a highly challenging task for legged robots, requiring them to traverse various terrains with agile and smooth locomotion. This necessitates comprehensive understanding of both the robot&#39;s own state and the surrounding terrain, despite the inherent unreliability of robot perception and actuation. Current state-of-the-art methods either rely on complex pre-trained high-level terrai&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.13740v3-abstract-full').style.display = 'inline'; document.getElementById('2408.13740v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.13740v3-abstract-full" style="display: none;"> Parkour presents a highly challenging task for legged robots, requiring them to traverse various terrains with agile and smooth locomotion. This necessitates comprehensive understanding of both the robot&#39;s own state and the surrounding terrain, despite the inherent unreliability of robot perception and actuation. Current state-of-the-art methods either rely on complex pre-trained high-level terrain reconstruction modules or limit the maximum potential of robot parkour to avoid failure due to inaccurate perception. In this paper, we propose a one-stage end-to-end learning-based parkour framework: Parkour with Implicit-Explicit learning framework for legged robots (PIE) that leverages dual-level implicit-explicit estimation. 
With this mechanism, even a low-cost quadruped robot equipped with an unreliable egocentric depth camera can achieve exceptional performance on challenging parkour terrains using a relatively simple training process and reward function. While the training process is conducted entirely in simulation, our real-world validation demonstrates successful zero-shot deployment of our framework, showcasing superior parkour performance on harsh terrains. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.13740v3-abstract-full').style.display = 'none'; document.getElementById('2408.13740v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 25 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted for IEEE Robotics and Automation Letters (RA-L)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.13501">arXiv:2408.13501</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.13501">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> Utilizing Large Language Models for Named Entity Recognition in Traditional Chinese Medicine against COVID-19 Literature: Comparative Study </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Tong%2C+X">Xu Tong</a>, <a href="/search/cs?searchtype=author&amp;query=Smirnova%2C+N">Nina Smirnova</a>, <a href="/search/cs?searchtype=author&amp;query=Upadhyaya%2C+S">Sharmila Upadhyaya</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+R">Ran Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Culbert%2C+J+H">Jack H. Culbert</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+C">Chao Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Otto%2C+W">Wolfgang Otto</a>, <a href="/search/cs?searchtype=author&amp;query=Mayr%2C+P">Philipp Mayr</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.13501v1-abstract-short" style="display: inline;"> Objective: To explore and compare the performance of ChatGPT and other state-of-the-art LLMs on domain-specific NER tasks covering different entity types and domains in TCM against COVID-19 literature. 
Methods: We established a dataset of 389 articles on TCM against COVID-19, and manually annotated 48 of them with 6 types of entities belonging to 3 domains as the ground truth, against which the NE&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.13501v1-abstract-full').style.display = 'inline'; document.getElementById('2408.13501v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.13501v1-abstract-full" style="display: none;"> Objective: To explore and compare the performance of ChatGPT and other state-of-the-art LLMs on domain-specific NER tasks covering different entity types and domains in TCM against COVID-19 literature. Methods: We established a dataset of 389 articles on TCM against COVID-19, and manually annotated 48 of them with 6 types of entities belonging to 3 domains as the ground truth, against which the NER performance of LLMs can be assessed. We then performed NER tasks for the 6 entity types using ChatGPT (GPT-3.5 and GPT-4) and 4 state-of-the-art BERT-based question-answering (QA) models (RoBERTa, MiniLM, PubMedBERT and SciBERT) without prior training on the specific task. A domain fine-tuned model (GSAP-NER) was also applied for a comprehensive comparison. Results: The overall performance of LLMs varied significantly in exact match and fuzzy match. In the fuzzy match, ChatGPT surpassed BERT-based QA models in 5 out of 6 tasks, while in exact match, BERT-based QA models outperformed ChatGPT in 5 out of 6 tasks but with a smaller F-1 difference. GPT-4 showed a significant advantage over other models in fuzzy match, especially on the entity type of TCM formula and the Chinese patent drug (TFD) and ingredient (IG). Although GPT-4 outperformed BERT-based models on entity type of herb, target, and research method, none of the F-1 scores exceeded 0.5. GSAP-NER, outperformed GPT-4 in terms of F-1 by a slight margin on RM. ChatGPT achieved considerably higher recalls than precisions, particularly in the fuzzy match. Conclusions: The NER performance of LLMs is highly dependent on the entity type, and their performance varies across application scenarios. ChatGPT could be a good choice for scenarios where high recall is favored. However, for knowledge acquisition in rigorous scenarios, neither ChatGPT nor BERT-based QA models are off-the-shelf tools for professional practitioners. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.13501v1-abstract-full').style.display = 'none'; document.getElementById('2408.13501v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. 
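<p class="is-size-7">The exact-match versus fuzzy-match distinction that drives the comparison above can be reproduced in a few lines; here fuzzy matching is a case-insensitive substring test, which is only one possible definition and an assumption on our part, as are the example entities.</p>
<pre><code>
# Entity-level precision / recall / F1 under exact and substring-based fuzzy matching.
def evaluate(predicted, gold, fuzzy=False):
    def match(p, g):
        p, g = p.lower().strip(), g.lower().strip()
        return (p in g or g in p) if fuzzy else p == g
    tp = sum(any(match(p, g) for g in gold) for p in predicted)
    precision = tp / len(predicted) if predicted else 0.0
    recall = sum(any(match(p, g) for p in predicted) for g in gold) / len(gold) if gold else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return precision, recall, f1

gold = ["Lianhua Qingwen capsule", "glycyrrhizic acid"]
pred = ["Lianhua Qingwen", "glycyrrhizic acid", "ACE2"]
print(evaluate(pred, gold, fuzzy=False))   # exact match misses the shortened formula name
print(evaluate(pred, gold, fuzzy=True))    # fuzzy match credits the partial span
</code></pre>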
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">22 pages with 2 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> H.3.3 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.10903">arXiv:2408.10903</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.10903">pdf</a>, <a href="https://arxiv.org/format/2408.10903">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> BEYOND DIALOGUE: A Profile-Dialogue Alignment Framework Towards General Role-Playing Language Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yu%2C+Y">Yeyong Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+R">Runsheng Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+H">Haojie Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zhanqiu Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Qian%2C+Q">Quan Qian</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.10903v5-abstract-short" style="display: inline;"> The rapid advancement of large language models (LLMs) has revolutionized role-playing, enabling the development of general role-playing models. However, current role-playing training has two significant issues: (I) Using a predefined role profile to prompt dialogue training for specific scenarios usually leads to inconsistencies and even conflicts between the dialogue and the profile, resulting in&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.10903v5-abstract-full').style.display = 'inline'; document.getElementById('2408.10903v5-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.10903v5-abstract-full" style="display: none;"> The rapid advancement of large language models (LLMs) has revolutionized role-playing, enabling the development of general role-playing models. However, current role-playing training has two significant issues: (I) Using a predefined role profile to prompt dialogue training for specific scenarios usually leads to inconsistencies and even conflicts between the dialogue and the profile, resulting in training biases. (II) The model learns to imitate the role based solely on the profile, neglecting profile-dialogue alignment at the sentence level. In this work, we propose a simple yet effective framework called BEYOND DIALOGUE, designed to overcome these hurdles. This framework innovatively introduces &#34;beyond dialogue&#34; tasks to align dialogue with profile traits based on each specific scenario, thereby eliminating biases during training. Furthermore, by adopting an innovative prompting mechanism that generates reasoning outcomes for training, the framework allows the model to achieve fine-grained alignment between profile and dialogue at the sentence level. 
The aforementioned methods are fully automated and low-cost. Additionally, the integration of automated dialogue and objective evaluation methods forms a comprehensive framework, paving the way for general role-playing. Experimental results demonstrate that our model excels in adhering to and reflecting various dimensions of role profiles, outperforming most proprietary general and specialized role-playing baselines. All code and datasets are available at https://github.com/yuyouyu32/BeyondDialogue. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.10903v5-abstract-full').style.display = 'none'; document.getElementById('2408.10903v5-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.10774">arXiv:2408.10774</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.10774">pdf</a>, <a href="https://arxiv.org/format/2408.10774">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Flexora: Flexible Low Rank Adaptation for Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wei%2C+C">Chenxing Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Shu%2C+Y">Yao Shu</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+Y+T">Ying Tiffany He</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+F+R">Fei Richard Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.10774v2-abstract-short" style="display: inline;"> Large Language Models (LLMs) are driving advancements in artificial intelligence by increasing the scale of model parameters, which has significantly enhanced generalization ability and unlocked new capabilities in practice. However, their performance in specific downstream tasks is usually hindered by their knowledge boundaries on these tasks. Thus, fine-tuning techniques, especially the widely u&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.10774v2-abstract-full').style.display = 'inline'; document.getElementById('2408.10774v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.10774v2-abstract-full" style="display: none;"> Large Language Models (LLMs) are driving advancements in artificial intelligence by increasing the scale of model parameters, which has significantly enhanced generalization ability and unlocked new capabilities in practice. However, their performance in specific downstream tasks is usually hindered by their knowledge boundaries on these tasks. 
Thus, fine-tuning techniques, especially the widely used Low-Rank Adaptation (LoRA) method, have been introduced to expand the boundaries on these tasks, whereas LoRA would underperform on certain tasks owing to its potential overfitting on these tasks. To overcome this overfitting and improve the performance of LoRA, we propose the flexible low rank adaptation (Flexora) method to automatically and flexibly select the most important layers needing to be fine-tuned to achieve the best performance on different downstream tasks. Specifically, Flexora firstly frames this layer selection problem as a well-defined hyperparameter optimization (HPO) problem, then addresses it using the unrolled differentiation (UD) method, and finally selects the most useful layers based on the optimized hyperparameters. Our extensive experiments on many pretrained models and natural language tasks show that Flexora is able to consistently improve over the existing baselines, indicating the effectiveness of our Flexora in practice. We additionally provide insightful theoretical results and many ablation studies to deliver a comprehensive understanding of our Flexora. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.10774v2-abstract-full').style.display = 'none'; document.getElementById('2408.10774v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">29 pages, 13 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.09698">arXiv:2408.09698</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.09698">pdf</a>, <a href="https://arxiv.org/format/2408.09698">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Harnessing Multimodal Large Language Models for Multimodal Sequential Recommendation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ye%2C+Y">Yuyang Ye</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+Z">Zhi Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+Y">Yishan Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+T">Tianshu Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+H">Hengruo Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+P">Peijun Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+R">Runlong Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+K">Kai Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Xiong%2C+H">Hui Xiong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.09698v4-abstract-short" style="display: inline;"> 
arXiv:2408.09698  [pdf, other]  cs.IR  cs.AI
Harnessing Multimodal Large Language Models for Multimodal Sequential Recommendation
Authors: Yuyang Ye, Zhi Zheng, Yishan Shen, Tianshu Wang, Hengruo Zhang, Peijun Zhu, Runlong Yu, Kai Zhang, Hui Xiong
Abstract: Recent advances in Large Language Models (LLMs) have demonstrated significant potential in the field of Recommendation Systems (RSs). Most existing studies have focused on converting user behavior logs into textual prompts and leveraging techniques such as prompt tuning to enable LLMs for recommendation tasks. Meanwhile, research interest has recently grown in multimodal recommendation systems that integrate data from images, text, and other sources using modality fusion techniques. This introduces new challenges to the existing LLM-based recommendation paradigm, which relies solely on text-modality information. Moreover, although Multimodal Large Language Models (MLLMs) capable of processing multi-modal inputs have emerged, how to equip MLLMs with multi-modal recommendation capabilities remains largely unexplored. To this end, in this paper, we propose the Multimodal Large Language Model-enhanced Multimodal Sequential Recommendation (MLLM-MSR) model. To capture dynamic user preferences, we design a two-stage user preference summarization method. Specifically, we first utilize an MLLM-based item-summarizer to extract the image features of an item and convert the image into text. Then, we employ a recurrent user preference summarization paradigm to capture the dynamic changes in user preferences based on an LLM-based user-summarizer. Finally, to enable the MLLM for the multi-modal recommendation task, we propose to fine-tune an MLLM-based recommender using Supervised Fine-Tuning (SFT) techniques. Extensive evaluations across various datasets validate the effectiveness of MLLM-MSR, showcasing its superior ability to capture and adapt to the evolving dynamics of user preferences.
Submitted 4 November, 2024; v1 submitted 19 August, 2024; originally announced August 2024.

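The two-stage, recurrent summarization described above is mostly prompt plumbing, so a short sketch of the data flow may help. The call_mllm and call_llm functions below are hypothetical stand-ins for whatever multimodal and text backends are used, and the prompt wording and field names are invented rather than taken from MLLM-MSR.

# Illustrative data flow for recurrent user-preference summarization (assumed interfaces).
def call_mllm(image_path: str, prompt: str) -> str:
    """Hypothetical MLLM call: describe an item image in text."""
    return f"<description of {image_path}>"   # placeholder output

def call_llm(prompt: str) -> str:
    """Hypothetical LLM call: return a text completion."""
    return "<updated preference summary>"      # placeholder output

def summarize_user(interactions):
    """interactions: chronological list of dicts with 'image' and 'title' fields."""
    summary = "No information about this user yet."
    for item in interactions:
        # Stage 1: the item-summarizer turns the item image into text.
        item_text = call_mllm(item["image"], "Describe this product image briefly.")
        # Stage 2: recurrently fold the new item into the running preference summary.
        summary = call_llm(
            "Current preference summary:\n" + summary +
            f"\nThe user just interacted with: {item['title']} - {item_text}\n"
            "Update the preference summary in two sentences."
        )
    return summary

print(summarize_user([{"image": "shoe.jpg", "title": "Trail running shoes"},
                      {"image": "watch.jpg", "title": "GPS sports watch"}]))
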
arXiv:2408.08201  [pdf, other]  cs.CV
Heavy Labels Out! Dataset Distillation with Label Space Lightening
Authors: Ruonan Yu, Songhua Liu, Zigeng Chen, Jingwen Ye, Xinchao Wang
Abstract: Dataset distillation or condensation aims to condense a large-scale training dataset into a much smaller synthetic one such that the training performance of the distilled and original sets on neural networks is similar. Although the number of training samples can be reduced substantially, current state-of-the-art methods heavily rely on enormous soft labels to achieve satisfactory performance. As a result, the required storage can be comparable even to that of the original datasets, especially for large-scale ones. To solve this problem, instead of storing these heavy labels, we propose a novel label-lightening framework termed HeLlO, aiming at effective image-to-label projectors with which synthetic labels can be generated online directly from synthetic images. Specifically, to construct such projectors, we leverage prior knowledge in open-source foundation models, e.g., CLIP, and introduce a LoRA-like fine-tuning strategy to mitigate the gap between the pre-trained and target distributions, so that the original models for soft-label generation can be distilled into a group of low-rank matrices. Moreover, an effective image optimization method is proposed to further mitigate the potential error between the original and distilled label generators. Extensive experiments demonstrate that with only about 0.003% of the original storage required for a complete set of soft labels, we achieve comparable performance to current state-of-the-art dataset distillation methods on large-scale datasets. Our code will be available.
Submitted 15 August, 2024; originally announced August 2024.

arXiv:2408.07490  [pdf, other]  cs.CV
Attention-Guided Perturbation for Unsupervised Image Anomaly Detection
Authors: Tingfeng Huang, Yuxuan Cheng, Jingbo Xia, Rui Yu, Yuxuan Cai, Jinhai Xiang, Xinwei He, Xiang Bai
Abstract: Reconstruction-based methods have significantly advanced modern unsupervised anomaly detection. However, the strong capacity of neural networks often violates the underlying assumptions by reconstructing abnormal samples well. To alleviate this issue, we present a simple yet effective reconstruction framework named Attention-Guided Perturbation Network (AGPNet), which learns to add perturbation noise with an attention mask for accurate unsupervised anomaly detection. Specifically, it consists of two branches, i.e., a plain reconstruction branch and an auxiliary attention-based perturbation branch. The reconstruction branch is simply a plain reconstruction network that learns to reconstruct normal samples, while the auxiliary branch aims to produce attention masks that guide the noise perturbation process for normal samples from easy to hard. By doing so, we expect to synthesize hard yet more informative anomalies for training, which enables the reconstruction branch to learn important inherent normal patterns both comprehensively and efficiently. Extensive experiments are conducted on three popular benchmarks covering MVTec-AD, VisA, and MVTec-3D, and show that our framework obtains leading anomaly detection performance under various setups including few-shot, one-class, and multi-class setups.
Submitted 14 August, 2024; originally announced August 2024.

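A toy version of the perturb-then-reconstruct training loop might look like the following sketch. The attention mask here is derived from the autoencoder's own first-layer feature magnitudes and the noise schedule is a simple linear ramp; both are assumptions for illustration, not the paper's mask design or curriculum.

# Toy perturb-then-reconstruct loop in the spirit of attention-guided perturbation.
# The mask construction and noise schedule are assumptions for illustration only.
import torch
import torch.nn as nn

autoencoder = nn.Sequential(
    nn.Conv2d(3, 8, 3, padding=1), nn.ReLU(),
    nn.Conv2d(8, 3, 3, padding=1), nn.Sigmoid(),
)
opt = torch.optim.Adam(autoencoder.parameters(), lr=1e-3)

def attention_mask(x):
    # Stand-in attention: normalized per-pixel feature magnitude (not the paper's design).
    feats = autoencoder[0](x).abs().mean(dim=1, keepdim=True)
    return feats / (feats.amax(dim=(2, 3), keepdim=True) + 1e-8)

normal_batch = torch.rand(8, 3, 64, 64)           # normal training images (toy data)
for step in range(100):
    with torch.no_grad():
        mask = attention_mask(normal_batch)
    noise_scale = 0.1 + 0.4 * step / 100           # perturb "from easy to hard"
    perturbed = normal_batch + noise_scale * mask * torch.randn_like(normal_batch)
    recon = autoencoder(perturbed.clamp(0, 1))
    loss = ((recon - normal_batch) ** 2).mean()    # reconstruct the clean normal image
    opt.zero_grad(); loss.backward(); opt.step()

# At test time, the per-pixel reconstruction error serves as the anomaly score.
test = torch.rand(1, 3, 64, 64)
score_map = ((autoencoder(test) - test) ** 2).mean(dim=1)
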
arXiv:2408.06656  [pdf, other]  cs.RO
MAPPO-PIS: A Multi-Agent Proximal Policy Optimization Method with Prior Intent Sharing for CAVs' Cooperative Decision-Making
Authors: Yicheng Guo, Jiaqi Liu, Rongjie Yu, Peng Hang, Jian Sun
Abstract: Vehicle-to-Vehicle (V2V) technologies have great potential for enhancing traffic flow efficiency and safety. However, cooperative decision-making in multi-agent systems, particularly in complex human-machine mixed merging areas, remains challenging for connected and autonomous vehicles (CAVs). Intent sharing, a key aspect of human coordination, may offer an effective solution to these decision-making problems, but its application in CAVs is under-explored. This paper presents an intent-sharing-based cooperative method, Multi-Agent Proximal Policy Optimization with Prior Intent Sharing (MAPPO-PIS), which models the CAV cooperative decision-making problem as a Multi-Agent Reinforcement Learning (MARL) problem. It trains and updates the agents' policies through the integration of two key modules: the Intention Generator Module (IGM) and the Safety Enhanced Module (SEM). The IGM is specifically crafted to generate and disseminate CAVs' intended trajectories spanning multiple future time-steps. The SEM, in turn, assesses the safety of the decisions made and rectifies them if necessary. A merging area with human-machine mixed traffic flow is selected to validate our method. Results show that MAPPO-PIS significantly improves decision-making performance in multi-agent systems, surpassing state-of-the-art baselines in safety, efficiency, and overall traffic system performance. The code and video demo can be found at https://github.com/CCCC1dhcgd/A-MAPPO-PIS.
Submitted 26 August, 2024; v1 submitted 13 August, 2024; originally announced August 2024.

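The intent-sharing idea itself, each agent broadcasting intended future waypoints and conditioning its decision on the others' intents, can be illustrated without any RL machinery. In the sketch below, constant-velocity extrapolation stands in for the Intention Generator Module and a naive distance check stands in for the Safety Enhanced Module; the dynamics, horizon, and thresholds are invented.

# Toy illustration of intent sharing: broadcast future waypoints, augment observations,
# and rectify unsafe accelerations. Dynamics, horizon, and thresholds are invented.
import numpy as np

HORIZON, DT, SAFE_GAP = 3, 1.0, 5.0

def intended_trajectory(pos, vel):
    """Stand-in 'intention generator': constant-velocity extrapolation of waypoints."""
    return np.array([pos + vel * DT * (k + 1) for k in range(HORIZON)])

def build_observation(own_state, shared_intents):
    """Concatenate the agent's own state with every neighbour's broadcast intent."""
    return np.concatenate([own_state] + [intent.ravel() for intent in shared_intents])

def rectify(accel, own_traj, other_trajs):
    """Stand-in 'safety module': brake if any intended waypoints come too close."""
    for traj in other_trajs:
        if np.min(np.abs(own_traj - traj)) < SAFE_GAP:
            return min(accel, -1.0)
    return accel

# Two vehicles approaching a merge point on a 1-D longitudinal axis (toy state: pos, vel).
states = {"cav_1": (0.0, 10.0), "cav_2": (4.0, 9.0)}
intents = {k: intended_trajectory(np.array([p]), np.array([v]))
           for k, (p, v) in states.items()}

for name, (pos, vel) in states.items():
    others = [intents[o] for o in intents if o != name]
    obs = build_observation(np.array([pos, vel]), others)   # what the policy would see
    accel = 0.5                                              # placeholder policy output
    accel = rectify(accel, intents[name], others)
    print(name, "obs dim:", obs.shape[0], "acceleration after safety check:", accel)
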
arXiv:2408.04590  [pdf, other]  cs.LG
Learn To Learn More Precisely
Authors: Runxi Cheng, Yongxian Wei, Xianglong He, Wanyun Zhu, Songsong Huang, Fei Richard Yu, Fei Ma, Chun Yuan
Abstract: Meta-learning has been extensively applied in the domains of few-shot learning and fast adaptation, achieving remarkable performance. While meta-learning methods like Model-Agnostic Meta-Learning (MAML) and its variants provide a good set of initial parameters for the model, the model still tends to learn shortcut features, which leads to poor generalization. In this paper, we propose the formal conception of "learn to learn more precisely", which aims to make the model learn precise target knowledge from data and reduce the effect of noisy knowledge, such as background and noise. To achieve this target, we propose a simple and effective meta-learning framework named Meta Self-Distillation (MSD) that maximizes the consistency of learned knowledge, enhancing the model's ability to learn precise target knowledge. In the inner loop, MSD uses different augmented views of the same support data to update the model separately. Then, in the outer loop, MSD utilizes the same query data to optimize the consistency of learned knowledge, enhancing the model's ability to learn more precisely. Our experiments demonstrate that MSD exhibits remarkable performance in few-shot classification tasks in both standard and augmented scenarios, effectively boosting the accuracy and consistency of knowledge learned by the model.
Submitted 8 August, 2024; originally announced August 2024.
Comments: 10 pages, 4 figures, meta learning

arXiv:2407.16674  [pdf, other]  cs.LG  cs.AI
KAN or MLP: A Fairer Comparison
Authors: Runpeng Yu, Weihao Yu, Xinchao Wang
Abstract: This paper does not introduce a novel method. Instead, it offers a fairer and more comprehensive comparison of KAN and MLP models across various tasks, including machine learning, computer vision, audio processing, natural language processing, and symbolic formula representation. Specifically, we control the number of parameters and FLOPs to compare the performance of KAN and MLP. Our main observation is that, except for symbolic formula representation tasks, MLP generally outperforms KAN. We also conduct ablation studies on KAN and find that its advantage in symbolic formula representation mainly stems from its B-spline activation function. When B-spline is applied to MLP, performance in symbolic formula representation significantly improves, surpassing or matching that of KAN. However, in other tasks where MLP already excels over KAN, B-spline does not substantially enhance MLP's performance. Furthermore, we find that KAN's forgetting issue is more severe than that of MLP in a standard class-incremental continual learning setting, which differs from the findings reported in the KAN paper. We hope these results provide insights for future research on KAN and other MLP alternatives. Project link: https://github.com/yu-rp/KANbeFair
Submitted 17 August, 2024; v1 submitted 23 July, 2024; originally announced July 2024.
Comments: Technical Report

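Parameter-matched comparison is simple to set up: fix a budget, then size the competing architecture to stay within it. The helper below uses a rough per-edge coefficient count for a B-spline-parameterized KAN layer as an assumption; it is a back-of-the-envelope sketch, not the paper's accounting, which also controls FLOPs.

# Back-of-the-envelope parameter matching between an MLP and a KAN-style layer stack.
# The KAN per-edge count (grid + spline_order + 1 coefficients) is an assumption used
# purely for illustration.
def mlp_params(dims):
    return sum(d_in * d_out + d_out for d_in, d_out in zip(dims[:-1], dims[1:]))

def kan_params(dims, grid=5, spline_order=3):
    per_edge = grid + spline_order + 1
    return sum(d_in * d_out * per_edge for d_in, d_out in zip(dims[:-1], dims[1:]))

def widen_mlp_to_budget(in_dim, out_dim, budget):
    """Find the largest single hidden width whose MLP stays within the budget."""
    width = 1
    while mlp_params([in_dim, width + 1, out_dim]) <= budget:
        width += 1
    return width

budget = kan_params([784, 64, 10])             # parameter count of a reference KAN shape
width = widen_mlp_to_budget(784, 10, budget)   # MLP hidden width with a comparable budget
print(budget, width, mlp_params([784, width, 10]))
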
arXiv:2407.15141  [pdf, other]  cs.AI  cs.LG  physics.chem-ph
Text-Augmented Multimodal LLMs for Chemical Reaction Condition Recommendation
Authors: Yu Zhang, Ruijie Yu, Kaipeng Zeng, Ding Li, Feng Zhu, Xiaokang Yang, Yaohui Jin, Yanyan Xu
Abstract: High-throughput reaction condition (RC) screening is fundamental to chemical synthesis. However, current RC screening suffers from laborious and costly trial-and-error workflows. Traditional computer-aided synthesis planning (CASP) tools fail to find suitable RCs due to data sparsity and inadequate reaction representations. Nowadays, large language models (LLMs) are capable of tackling chemistry-related problems, such as molecule design and chemical logic Q&A tasks. However, LLMs have not yet achieved accurate predictions of chemical reaction conditions. Here, we present MM-RCR, a text-augmented multimodal LLM that learns a unified reaction representation from SMILES, reaction graphs, and a textual corpus for chemical reaction condition recommendation (RCR). To train MM-RCR, we construct a dataset of 1.2 million pair-wise Q&A instructions. Our experimental results demonstrate that MM-RCR achieves state-of-the-art performance on two open benchmark datasets and exhibits strong generalization capabilities on out-of-domain (OOD) and High-Throughput Experimentation (HTE) datasets. MM-RCR has the potential to accelerate high-throughput condition screening in chemical synthesis.
Submitted 21 July, 2024; originally announced July 2024.

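Pair-wise Q&A instruction data for condition recommendation is easy to picture; the snippet below shows one plausible way to serialize a reaction (SMILES plus free-text context) into an instruction-tuning record. The field names, prompt wording, and example reaction are invented for illustration and are not the MM-RCR data format.

# Illustrative construction of a Q&A instruction record for condition recommendation.
# Field names, prompt text, and the example reaction are invented, not the MM-RCR format.
import json

def make_record(reaction_smiles: str, context: str, conditions: str) -> dict:
    return {
        "instruction": "Recommend suitable reaction conditions for the given reaction.",
        "input": f"Reaction SMILES: {reaction_smiles}\nContext: {context}",
        "output": conditions,
    }

record = make_record(
    reaction_smiles="c1ccccc1Br.OB(O)c1ccccc1>>c1ccc(-c2ccccc2)cc1",
    context="Suzuki-Miyaura coupling of bromobenzene with phenylboronic acid.",
    conditions="Pd(PPh3)4 catalyst, K2CO3 base, toluene/water, 80 °C.",
)
print(json.dumps(record, indent=2))
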
arXiv:2407.14968  [pdf, other]  cs.LG  q-bio.BM
Technical report: Improving the properties of molecules generated by LIMO
Authors: Vineet Thumuluri, Peter Eckmann, Michael K. Gilson, Rose Yu
Abstract: This technical report investigates variants of the Latent Inceptionism on Molecules (LIMO) framework to improve the properties of generated molecules. We conduct ablative studies of the molecular representation, decoder model, and surrogate model training scheme. The experiments suggest that an autoregressive Transformer decoder with GroupSELFIES achieves the best average properties for the random generation task.
Submitted 20 July, 2024; originally announced July 2024.
Comments: 9 pages, 2 figures

arXiv:2407.11902  [pdf, other]  cs.CV
Encapsulating Knowledge in One Prompt
Authors: Qi Li, Runpeng Yu, Xinchao Wang
Abstract: This paradigm encapsulates knowledge from various models into a solitary prompt without altering the original models or requiring access to the training data, which enables us to achieve efficient and convenient knowledge transfer in more realistic scenarios. From a practicality standpoint, this paradigm not only proves, for the first time, the effectiveness of visual prompts in data-inaccessible contexts, but also solves the problems of low model reusability and high storage resource consumption faced by traditional Data-Free Knowledge Transfer, meaning that we can realize parallel knowledge transfer across multiple models without modifying any source model. Extensive experiments across various datasets and models demonstrate the efficacy of the proposed KiOP knowledge transfer paradigm. Without access to real training data and under rigorous storage capacity constraints, it is also capable of yielding considerable outcomes when dealing with cross-model backbone setups and handling parallel knowledge transfer requests involving multiple (more than two) models.
Submitted 16 July, 2024; originally announced July 2024.

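A stripped-down picture of "knowledge in one prompt" is a learnable input perturbation optimized so that a frozen carrier model, fed prompted inputs, mimics a frozen source model. The sketch below uses random surrogate images and toy CNNs, which are assumptions; KiOP itself operates without real training data through its own mechanisms, and nothing here reproduces its parallel multi-model setup.

# Toy sketch: learn a visual prompt so a frozen "carrier" model mimics a frozen source
# model on prompted inputs. Surrogate random images and toy CNNs are assumptions.
import torch
import torch.nn as nn

def tiny_cnn():
    return nn.Sequential(nn.Conv2d(3, 8, 3, padding=1), nn.ReLU(),
                         nn.AdaptiveAvgPool2d(1), nn.Flatten(), nn.Linear(8, 10))

source, carrier = tiny_cnn(), tiny_cnn()                  # both stay frozen
for m in (source, carrier):
    for p in m.parameters():
        p.requires_grad_(False)

prompt = torch.zeros(1, 3, 32, 32, requires_grad=True)    # the single learnable prompt
opt = torch.optim.Adam([prompt], lr=1e-2)
kl = nn.KLDivLoss(reduction="batchmean")

for step in range(200):
    x = torch.rand(16, 3, 32, 32)                # surrogate inputs (no real data used)
    with torch.no_grad():
        teacher_probs = torch.softmax(source(x), dim=-1)
    student_logprobs = torch.log_softmax(carrier(x + prompt), dim=-1)
    loss = kl(student_logprobs, teacher_probs)   # the prompt absorbs the source "knowledge"
    opt.zero_grad(); loss.backward(); opt.step()
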
arXiv:2407.11203  [pdf]  cs.CY  cs.CL
The Life Cycle of Large Language Models: A Review of Biases in Education
Authors: Jinsook Lee, Yann Hicke, Renzhe Yu, Christopher Brooks, René F. Kizilcec
Abstract: Large Language Models (LLMs) are increasingly adopted in educational contexts to provide personalized support to students and teachers. The unprecedented capacity of LLM-based applications to understand and generate natural language can potentially improve instructional effectiveness and learning outcomes, but the integration of LLMs in education technology has renewed concerns over algorithmic bias, which may exacerbate educational inequities. In this review, building on prior work on mapping the traditional machine learning life cycle, we provide a holistic map of the LLM life cycle, from the initial development of LLMs to the customization of pre-trained models for various applications in educational settings. We explain each step in the LLM life cycle and identify potential sources of bias that may arise in the context of education. We discuss why current measures of bias from traditional machine learning fail to transfer to LLM-generated content in education, such as tutoring conversations, because the text is high-dimensional, there can be multiple correct responses, and tailoring responses may be pedagogically desirable rather than unfair. This review aims to clarify the complex nature of bias in LLM applications and provide practical guidance for their evaluation to promote educational equity.
Submitted 3 June, 2024; originally announced July 2024.
Comments: 20 pages, 2 figures, preprint for British Journal of Educational Technology submission

arXiv:2407.11036  [pdf, other]  cs.AI  cs.NI
Hybrid-Generative Diffusion Models for Attack-Oriented Twin Migration in Vehicular Metaverses
Authors: Yingkai Kang, Jinbo Wen, Jiawen Kang, Tao Zhang, Hongyang Du, Dusit Niyato, Rong Yu, Shengli Xie
Abstract: The vehicular metaverse is envisioned as a blended immersive domain that promises to bring revolutionary changes to the automotive industry. As a core component of vehicular metaverses, Vehicle Twins (VTs) are digital twins that cover the entire life cycle of vehicles, providing immersive virtual services for Vehicular Metaverse Users (VMUs). Vehicles with limited resources offload the computationally intensive tasks of constructing and updating VTs to edge servers and migrate VTs between these servers, ensuring seamless and immersive experiences for VMUs. However, the high mobility of vehicles, uneven deployment of edge servers, and potential security threats pose challenges to achieving efficient and reliable VT migrations. To address these issues, we propose a secure and reliable VT migration framework in vehicular metaverses. Specifically, we design a two-layer trust evaluation model to comprehensively evaluate the reputation value of edge servers in the network communication and interaction layers. Then, we model the VT migration problem as a partially observable Markov decision process and design a hybrid-Generative Diffusion Model (GDM) algorithm based on deep reinforcement learning to generate optimal migration decisions by taking hybrid actions (i.e., continuous actions and discrete actions). Numerical results demonstrate that the hybrid-GDM algorithm outperforms the baseline algorithms, showing strong adaptability in various settings and highlighting the potential of the hybrid-GDM algorithm for addressing various optimization issues in vehicular metaverses.
Submitted 5 July, 2024; originally announced July 2024.

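The "hybrid action" component is the easiest piece to make concrete: one policy head picks a discrete target (which edge server receives the migrated twin) while another outputs continuous resource allocations. The sketch below shows only that output structure with a plain feed-forward network; it does not implement the paper's diffusion-based policy, trust evaluation, or training loop, and all dimensions are arbitrary.

# Toy hybrid-action policy head: a discrete server choice plus continuous allocations.
# A plain MLP stands in for the paper's diffusion-based policy; sizes are arbitrary.
import torch
import torch.nn as nn

class HybridPolicy(nn.Module):
    def __init__(self, obs_dim=32, num_servers=5, num_cont=2):
        super().__init__()
        self.trunk = nn.Sequential(nn.Linear(obs_dim, 64), nn.ReLU())
        self.server_head = nn.Linear(64, num_servers)    # discrete: target edge server
        self.alloc_head = nn.Linear(64, num_cont)         # continuous: resource fractions

    def forward(self, obs):
        h = self.trunk(obs)
        server_dist = torch.distributions.Categorical(logits=self.server_head(h))
        alloc = torch.sigmoid(self.alloc_head(h))          # keep allocations in [0, 1]
        return server_dist, alloc

policy = HybridPolicy()
obs = torch.randn(1, 32)                                   # partial observation of the system
server_dist, alloc = policy(obs)
action = {"server": server_dist.sample().item(), "allocation": alloc.squeeze(0).tolist()}
print(action)
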