Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/>  <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b">  <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\$","\$"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a>  <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div>  <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div>  <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 2,097 results for author: <span class="mathjax">Zhu, Y</span> </h1> </div> <div class="level-right is-hidden-mobile">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Zhu%2C+Y">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Zhu, Y"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Zhu%2C+Y&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Zhu, Y"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Zhu%2C+Y&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Zhu%2C+Y&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhu%2C+Y&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhu%2C+Y&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhu%2C+Y&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhu%2C+Y&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.16503">arXiv:2411.16503</a> <span> [<a href="https://arxiv.org/pdf/2411.16503">pdf</a>, <a href="https://arxiv.org/format/2411.16503">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Noise Diffusion for Enhancing Semantic Faithfulness in Text-to-Image Synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Miao%2C+B">Boming Miao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Chunxiao Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiaoxiao Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+A">Andi Zhang</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+R">Rui Sun</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zizhe Wang</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yao Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.16503v1-abstract-short" style="display: inline;"> Diffusion models have achieved impressive success in generating photorealistic images, but challenges remain in ensuring precise semantic alignment with input prompts. Optimizing the initial noisy latent offers a more efficient alternative to modifying model architectures or prompt engineering for improving semantic alignment. A latest approach, InitNo, refines the initial noisy latent by leveragi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16503v1-abstract-full').style.display = 'inline'; document.getElementById('2411.16503v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.16503v1-abstract-full" style="display: none;"> Diffusion models have achieved impressive success in generating photorealistic images, but challenges remain in ensuring precise semantic alignment with input prompts. Optimizing the initial noisy latent offers a more efficient alternative to modifying model architectures or prompt engineering for improving semantic alignment. A latest approach, InitNo, refines the initial noisy latent by leveraging attention maps; however, these maps capture only limited information, and the effectiveness of InitNo is highly dependent on the initial starting point, as it tends to converge on a local optimum near this point. To this end, this paper proposes leveraging the language comprehension capabilities of large vision-language models (LVLMs) to guide the optimization of the initial noisy latent, and introduces the Noise Diffusion process, which updates the noisy latent to generate semantically faithful images while preserving distribution consistency. Furthermore, we provide a theoretical analysis of the condition under which the update improves semantic faithfulness. Experimental results demonstrate the effectiveness and adaptability of our framework, consistently enhancing semantic alignment across various diffusion models. The code is available at https://github.com/Bomingmiao/NoiseDiffusion. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16503v1-abstract-full').style.display = 'none'; document.getElementById('2411.16503v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.15808">arXiv:2411.15808</a> <span> [<a href="https://arxiv.org/pdf/2411.15808">pdf</a>, <a href="https://arxiv.org/format/2411.15808">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> LRSAA: Large-scale Remote Sensing Image Target Recognition and Automatic Annotation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Dong%2C+W">Wuzheng Dong</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yujuan Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.15808v2-abstract-short" style="display: inline;"> This paper presents a method for object recognition and automatic labeling in large-area remote sensing images called LRSAA. The method integrates YOLOv11 and MobileNetV3-SSD object detection algorithms through ensemble learning to enhance model performance. Furthermore, it employs Poisson disk sampling segmentation techniques and the EIOU metric to optimize the training and inference processes of… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15808v2-abstract-full').style.display = 'inline'; document.getElementById('2411.15808v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.15808v2-abstract-full" style="display: none;"> This paper presents a method for object recognition and automatic labeling in large-area remote sensing images called LRSAA. The method integrates YOLOv11 and MobileNetV3-SSD object detection algorithms through ensemble learning to enhance model performance. Furthermore, it employs Poisson disk sampling segmentation techniques and the EIOU metric to optimize the training and inference processes of segmented images, followed by the integration of results. This approach not only reduces the demand for computational resources but also achieves a good balance between accuracy and speed. The source code for this project has been made publicly available on https://github.com/anaerovane/LRSAA. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15808v2-abstract-full').style.display = 'none'; document.getElementById('2411.15808v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">arXiv admin note: text overlap with arXiv:2411.07802</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.15215">arXiv:2411.15215</a> <span> [<a href="https://arxiv.org/pdf/2411.15215">pdf</a>, <a href="https://arxiv.org/format/2411.15215">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Biomolecules">q-bio.BM</span> </div> </div> <p class="title is-5 mathjax"> S$^2$ALM: Sequence-Structure Pre-trained Large Language Model for Comprehensive Antibody Representation Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yin%2C+M">Mingze Yin</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+H">Hanjing Zhou</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+J">Jialu Wu</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yiheng Zhu</a>, <a href="/search/cs?searchtype=author&query=Zhan%2C+Y">Yuxuan Zhan</a>, <a href="/search/cs?searchtype=author&query=Kong%2C+Z">Zitai Kong</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+H">Hongxia Xu</a>, <a href="/search/cs?searchtype=author&query=Hsieh%2C+C">Chang-Yu Hsieh</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jintai Chen</a>, <a href="/search/cs?searchtype=author&query=Hou%2C+T">Tingjun Hou</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+J">Jian Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.15215v1-abstract-short" style="display: inline;"> Antibodies safeguard our health through their precise and potent binding to specific antigens, demonstrating promising therapeutic efficacy in the treatment of numerous diseases, including COVID-19. Recent advancements in biomedical language models have shown the great potential to interpret complex biological structures and functions. However, existing antibody specific models have a notable limi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15215v1-abstract-full').style.display = 'inline'; document.getElementById('2411.15215v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.15215v1-abstract-full" style="display: none;"> Antibodies safeguard our health through their precise and potent binding to specific antigens, demonstrating promising therapeutic efficacy in the treatment of numerous diseases, including COVID-19. Recent advancements in biomedical language models have shown the great potential to interpret complex biological structures and functions. However, existing antibody specific models have a notable limitation that they lack explicit consideration for antibody structural information, despite the fact that both 1D sequence and 3D structure carry unique and complementary insights into antibody behavior and functionality. This paper proposes Sequence-Structure multi-level pre-trained Antibody Language Model (S$^2$ALM), combining holistic sequential and structural information in one unified, generic antibody foundation model. We construct a hierarchical pre-training paradigm incorporated with two customized multi-level training objectives to facilitate the modeling of comprehensive antibody representations. S$^2$ALM's representation space uncovers inherent functional binding mechanisms, biological evolution properties and structural interaction patterns. Pre-trained over 75 million sequences and 11.7 million structures, S$^2$ALM can be adopted for diverse downstream tasks: accurately predicting antigen-antibody binding affinities, precisely distinguishing B cell maturation stages, identifying antibody crucial binding positions, and specifically designing novel coronavirus-binding antibodies. Remarkably, S$^2$ALM outperforms well-established and renowned baselines and sets new state-of-the-art performance across extensive antibody specific understanding and generation tasks. S$^2$ALM's ability to model comprehensive and generalized representations further positions its potential to advance real-world therapeutic antibody development, potentially addressing unmet academic, industrial, and clinical needs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15215v1-abstract-full').style.display = 'none'; document.getElementById('2411.15215v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14716">arXiv:2411.14716</a> <span> [<a href="https://arxiv.org/pdf/2411.14716">pdf</a>, <a href="https://arxiv.org/format/2411.14716">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> VisionPAD: A Vision-Centric Pre-training Paradigm for Autonomous Driving </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Haiming Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+W">Wending Zhou</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yiyao Zhu</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+X">Xu Yan</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+J">Jiantao Gao</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+D">Dongfeng Bai</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+Y">Yingjie Cai</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+B">Bingbing Liu</a>, <a href="/search/cs?searchtype=author&query=Cui%2C+S">Shuguang Cui</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhen Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14716v1-abstract-short" style="display: inline;"> This paper introduces VisionPAD, a novel self-supervised pre-training paradigm designed for vision-centric algorithms in autonomous driving. In contrast to previous approaches that employ neural rendering with explicit depth supervision, VisionPAD utilizes more efficient 3D Gaussian Splatting to reconstruct multi-view representations using only images as supervision. Specifically, we introduce a s… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14716v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14716v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14716v1-abstract-full" style="display: none;"> This paper introduces VisionPAD, a novel self-supervised pre-training paradigm designed for vision-centric algorithms in autonomous driving. In contrast to previous approaches that employ neural rendering with explicit depth supervision, VisionPAD utilizes more efficient 3D Gaussian Splatting to reconstruct multi-view representations using only images as supervision. Specifically, we introduce a self-supervised method for voxel velocity estimation. By warping voxels to adjacent frames and supervising the rendered outputs, the model effectively learns motion cues in the sequential data. Furthermore, we adopt a multi-frame photometric consistency approach to enhance geometric perception. It projects adjacent frames to the current frame based on rendered depths and relative poses, boosting the 3D geometric representation through pure image supervision. Extensive experiments on autonomous driving datasets demonstrate that VisionPAD significantly improves performance in 3D object detection, occupancy prediction and map segmentation, surpassing state-of-the-art pre-training strategies by a considerable margin. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14716v1-abstract-full').style.display = 'none'; document.getElementById('2411.14716v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14269">arXiv:2411.14269</a> <span> [<a href="https://arxiv.org/pdf/2411.14269">pdf</a>, <a href="https://arxiv.org/format/2411.14269">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Guided MRI Reconstruction via Schr枚dinger Bridge </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yue Wang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+T">Tian Zhou</a>, <a href="/search/cs?searchtype=author&query=Cui%2C+Z">Zhuo-xu Cui</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+B">Bingsheng Huang</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+H">Hairong Zheng</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+D">Dong Liang</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yanjie Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14269v1-abstract-short" style="display: inline;"> Magnetic Resonance Imaging (MRI) is a multi-contrast imaging technique in which different contrast images share similar structural information. However, conventional diffusion models struggle to effectively leverage this structural similarity. Recently, the Schr枚dinger Bridge (SB), a nonlinear extension of the diffusion model, has been proposed to establish diffusion paths between any distribution… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14269v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14269v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14269v1-abstract-full" style="display: none;"> Magnetic Resonance Imaging (MRI) is a multi-contrast imaging technique in which different contrast images share similar structural information. However, conventional diffusion models struggle to effectively leverage this structural similarity. Recently, the Schr枚dinger Bridge (SB), a nonlinear extension of the diffusion model, has been proposed to establish diffusion paths between any distributions, allowing the incorporation of guided priors. This study proposes an SB-based, multi-contrast image-guided reconstruction framework that establishes a diffusion bridge between the guiding and target image distributions. By using the guiding image along with data consistency during sampling, the target image is reconstructed more accurately. To better address structural differences between images, we introduce an inversion strategy from the field of image editing, termed $\mathbf{I}^2$SB-inversion. Experiments on a paried T1 and T2-FLAIR datasets demonstrate that $\mathbf{I}^2$SB-inversion achieve a high acceleration up to 14.4 and outperforms existing methods in terms of both reconstruction accuracy and stability. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14269v1-abstract-full').style.display = 'none'; document.getElementById('2411.14269v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14256">arXiv:2411.14256</a> <span> [<a href="https://arxiv.org/pdf/2411.14256">pdf</a>, <a href="https://arxiv.org/format/2411.14256">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Generalizing End-To-End Autonomous Driving In Real-World Environments Using Zero-Shot LLMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Dong%2C+Z">Zeyu Dong</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yimin Zhu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yansong Li</a>, <a href="/search/cs?searchtype=author&query=Mahon%2C+K">Kevin Mahon</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Y">Yu Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14256v1-abstract-short" style="display: inline;"> Traditional autonomous driving methods adopt a modular design, decomposing tasks into sub-tasks. In contrast, end-to-end autonomous driving directly outputs actions from raw sensor data, avoiding error accumulation. However, training an end-to-end model requires a comprehensive dataset; otherwise, the model exhibits poor generalization capabilities. Recently, large language models (LLMs) have been… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14256v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14256v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14256v1-abstract-full" style="display: none;"> Traditional autonomous driving methods adopt a modular design, decomposing tasks into sub-tasks. In contrast, end-to-end autonomous driving directly outputs actions from raw sensor data, avoiding error accumulation. However, training an end-to-end model requires a comprehensive dataset; otherwise, the model exhibits poor generalization capabilities. Recently, large language models (LLMs) have been applied to enhance the generalization capabilities of end-to-end driving models. Most studies explore LLMs in an open-loop manner, where the output actions are compared to those of experts without direct feedback from the real world, while others examine closed-loop results only in simulations. This paper proposes an efficient architecture that integrates multimodal LLMs into end-to-end driving models operating in closed-loop settings in real-world environments. In our architecture, the LLM periodically processes raw sensor data to generate high-level driving instructions, effectively guiding the end-to-end model, even at a slower rate than the raw sensor data. This architecture relaxes the trade-off between the latency and inference quality of the LLM. It also allows us to choose from a wide variety of LLMs to improve high-level driving instructions and minimize fine-tuning costs. Consequently, our architecture reduces data collection requirements because the LLMs do not directly output actions; we only need to train a simple imitation learning model to output actions. In our experiments, the training data for the end-to-end model in a real-world environment consists of only simple obstacle configurations with one traffic cone, while the test environment is more complex and contains multiple obstacles placed in various positions. Experiments show that the proposed architecture enhances the generalization capabilities of the end-to-end model even without fine-tuning the LLM. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14256v1-abstract-full').style.display = 'none'; document.getElementById('2411.14256v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14228">arXiv:2411.14228</a> <span> [<a href="https://arxiv.org/pdf/2411.14228">pdf</a>, <a href="https://arxiv.org/format/2411.14228">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> FocusLLaVA: A Coarse-to-Fine Approach for Efficient and Effective Visual Token Compression </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yuke Zhu</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+C">Chi Xie</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+S">Shuang Liang</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+B">Bo Zheng</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+S">Sheng Guo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14228v1-abstract-short" style="display: inline;"> Recent advances on Multi-modal Large Language Models have demonstrated that high-resolution image input is crucial for model capabilities, especially for fine-grained tasks. However, high-resolution images lead to a quadratic increase in the number of visual tokens input into LLMs, resulting in significant computational costs. Current work develop visual token compression methods to achieve effici… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14228v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14228v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14228v1-abstract-full" style="display: none;"> Recent advances on Multi-modal Large Language Models have demonstrated that high-resolution image input is crucial for model capabilities, especially for fine-grained tasks. However, high-resolution images lead to a quadratic increase in the number of visual tokens input into LLMs, resulting in significant computational costs. Current work develop visual token compression methods to achieve efficiency improvements, often at the expense of performance. We argue that removing visual redundancy can simultaneously improve both efficiency and performance. We build a coarse-to-fine visual token compression method, with a vision-guided sampler for compressing redundant regions with low information density, and a text-guided sampler for selecting visual tokens that are strongly correlated with the user instructions.With these two modules, the proposed FocusLLaVA achieves improvements in both efficiency and performance. We validate the effectiveness of our approach on a wide range of evaluation datasets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14228v1-abstract-full').style.display = 'none'; document.getElementById('2411.14228v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12669">arXiv:2411.12669</a> <span> [<a href="https://arxiv.org/pdf/2411.12669">pdf</a>, <a href="https://arxiv.org/ps/2411.12669">ps</a>, <a href="https://arxiv.org/format/2411.12669">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Constrained Coding and Deep Learning Aided Threshold Detection for Resistive Memories </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhong%2C+X">Xingwei Zhong</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+K">Kui Cai</a>, <a href="/search/cs?searchtype=author&query=Song%2C+G">Guanghui Song</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Weijie Wang</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yao Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12669v1-abstract-short" style="display: inline;"> Resistive random access memory (ReRAM) is a promising emerging non-volatile memory (NVM) technology that shows high potential for both data storage and computing. However, its crossbar array architecture leads to the sneak path problem, which may severely degrade the reliability of data stored in the ReRAM cell. Due to the complication of memory physics and unique features of the sneak path induce… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12669v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12669v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12669v1-abstract-full" style="display: none;"> Resistive random access memory (ReRAM) is a promising emerging non-volatile memory (NVM) technology that shows high potential for both data storage and computing. However, its crossbar array architecture leads to the sneak path problem, which may severely degrade the reliability of data stored in the ReRAM cell. Due to the complication of memory physics and unique features of the sneak path induced interference (SPI), it is difficult to derive an accurate channel model for it. The deep learning (DL)-based detection scheme \cite{zhong2020sneakdl} can better mitigate the SPI, at the cost of additional power consumption and read latency. In this letter, we first propose a novel CC scheme which can not only reduce the SPI in the memory array, but also effectively differentiate the memory arrays into two categories of sneak-path-free and sneak-path-affected arrays. For the sneak-path-free arrays, we can use a simple middle-point threshold detector to detect the low and high resistance cells of ReRAM. For the sneak-path-affected arrays, a DL detector is first trained off-line (prior to the data detection of ReRAM). To avoid the additional power consumption and latency introduced by the DL detector, we further propose a DL-based threshold detector, whose detection threshold can be derived based on the outputs of the DL detector. It is then utilized for the online data detection of all the identified sneak-path-affected arrays. Simulation results demonstrate that the above CC and DL aided threshold detection scheme can effectively mitigate the SPI of the ReRAM array and achieve better error rate performance than the prior art detection schemes, without the prior knowledge of the channel. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12669v1-abstract-full').style.display = 'none'; document.getElementById('2411.12669v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12359">arXiv:2411.12359</a> <span> [<a href="https://arxiv.org/pdf/2411.12359">pdf</a>, <a href="https://arxiv.org/format/2411.12359">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> TactV: A Class of Hybrid Terrestrial/Aerial Coaxial Tilt-Rotor Vehicles </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Dong%2C+Y">Yifei Dong</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yimin Zhu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Lixian Zhang</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+Y">Yihang Ding</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12359v1-abstract-short" style="display: inline;"> To enhance the obstacle-crossing and endurance capabilities of vehicles operating in complex environments, this paper presents the design of a hybrid terrestrial/aerial coaxial tilt-rotor vehicle, TactV, which integrates advantages such as lightweight construction and high maneuverability. Unlike existing tandem dual-rotor vehicles, TactV employs a tiltable coaxial dual-rotor design and features a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12359v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12359v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12359v1-abstract-full" style="display: none;"> To enhance the obstacle-crossing and endurance capabilities of vehicles operating in complex environments, this paper presents the design of a hybrid terrestrial/aerial coaxial tilt-rotor vehicle, TactV, which integrates advantages such as lightweight construction and high maneuverability. Unlike existing tandem dual-rotor vehicles, TactV employs a tiltable coaxial dual-rotor design and features a spherical cage structure that encases the body, allowing for omnidirectional movement while further reducing its overall dimensions. To enable TactV to maneuver flexibly in aerial, planar, and inclined surfaces, we established corresponding dynamic and control models for each mode. Additionally, we leveraged TactV's tiltable center of gravity to design energy-saving and high-mobility modes for ground operations, thereby further enhancing its endurance. Experimental designs for both aerial and ground tests corroborated the superiority of TactV's movement capabilities and control strategies. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12359v1-abstract-full').style.display = 'none'; document.getElementById('2411.12359v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11910">arXiv:2411.11910</a> <span> [<a href="https://arxiv.org/pdf/2411.11910">pdf</a>, <a href="https://arxiv.org/ps/2411.11910">ps</a>, <a href="https://arxiv.org/format/2411.11910">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> AIGS: Generating Science from AI-Powered Automated Falsification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zijun Liu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+K">Kaiming Liu</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yiqi Zhu</a>, <a href="/search/cs?searchtype=author&query=Lei%2C+X">Xuanyu Lei</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zonghan Yang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhenhe Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+P">Peng Li</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yang Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11910v2-abstract-short" style="display: inline;"> Rapid development of artificial intelligence has drastically accelerated the development of scientific discovery. Trained with large-scale observation data, deep neural networks extract the underlying patterns in an end-to-end manner and assist human researchers with highly-precised predictions in unseen scenarios. The recent rise of Large Language Models (LLMs) and the empowered autonomous agents… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11910v2-abstract-full').style.display = 'inline'; document.getElementById('2411.11910v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11910v2-abstract-full" style="display: none;"> Rapid development of artificial intelligence has drastically accelerated the development of scientific discovery. Trained with large-scale observation data, deep neural networks extract the underlying patterns in an end-to-end manner and assist human researchers with highly-precised predictions in unseen scenarios. The recent rise of Large Language Models (LLMs) and the empowered autonomous agents enable scientists to gain help through interaction in different stages of their research, including but not limited to literature review, research ideation, idea implementation, and academic writing. However, AI researchers instantiated by foundation model empowered agents with full-process autonomy are still in their infancy. In this paper, we study $\textbf{AI-Generated Science}$ (AIGS), where agents independently and autonomously complete the entire research process and discover scientific laws. By revisiting the definition of scientific research, we argue that $\textit{falsification}$ is the essence of both human research process and the design of an AIGS system. Through the lens of falsification, prior systems attempting towards AI-Generated Science either lack the part in their design, or rely heavily on existing verification engines that narrow the use in specialized domains. In this work, we propose Baby-AIGS as a baby-step demonstration of a full-process AIGS system, which is a multi-agent system with agents in roles representing key research process. By introducing FalsificationAgent, which identify and then verify possible scientific discoveries, we empower the system with explicit falsification. Experiments on three tasks preliminarily show that Baby-AIGS could produce meaningful scientific discoveries, though not on par with experienced human researchers. Finally, we discuss on the limitations of current Baby-AIGS, actionable insights, and related ethical issues in detail. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11910v2-abstract-full').style.display = 'none'; document.getElementById('2411.11910v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Pre-print. 35 pages. Official website: https://agent-force.github.io/AIGS/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11326">arXiv:2411.11326</a> <span> [<a href="https://arxiv.org/pdf/2411.11326">pdf</a>, <a href="https://arxiv.org/format/2411.11326">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Databases">cs.DB</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.14778/3654621.3654629">10.14778/3654621.3654629 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Intelligent Pooling: Proactive Resource Provisioning in Large-scale Cloud Service </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ravikumar%2C+D">Deepak Ravikumar</a>, <a href="/search/cs?searchtype=author&query=Yeo%2C+A">Alex Yeo</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yiwen Zhu</a>, <a href="/search/cs?searchtype=author&query=Lakra%2C+A">Aditya Lakra</a>, <a href="/search/cs?searchtype=author&query=Nagulapalli%2C+H">Harsha Nagulapalli</a>, <a href="/search/cs?searchtype=author&query=Ravindran%2C+S+K">Santhosh Kumar Ravindran</a>, <a href="/search/cs?searchtype=author&query=Suh%2C+S">Steve Suh</a>, <a href="/search/cs?searchtype=author&query=Dutta%2C+N">Niharika Dutta</a>, <a href="/search/cs?searchtype=author&query=Fogarty%2C+A">Andrew Fogarty</a>, <a href="/search/cs?searchtype=author&query=Park%2C+Y">Yoonjae Park</a>, <a href="/search/cs?searchtype=author&query=Khushalani%2C+S">Sumeet Khushalani</a>, <a href="/search/cs?searchtype=author&query=Tarafdar%2C+A">Arijit Tarafdar</a>, <a href="/search/cs?searchtype=author&query=Parekh%2C+K">Kunal Parekh</a>, <a href="/search/cs?searchtype=author&query=Krishnan%2C+S">Subru Krishnan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11326v1-abstract-short" style="display: inline;"> The proliferation of big data and analytic workloads has driven the need for cloud compute and cluster-based job processing. With Apache Spark, users can process terabytes of data at ease with hundreds of parallel executors. At Microsoft, we aim at providing a fast and succinct interface for users to run Spark applications, such as through creating simple notebook "sessions" by abstracting the und… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11326v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11326v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11326v1-abstract-full" style="display: none;"> The proliferation of big data and analytic workloads has driven the need for cloud compute and cluster-based job processing. With Apache Spark, users can process terabytes of data at ease with hundreds of parallel executors. At Microsoft, we aim at providing a fast and succinct interface for users to run Spark applications, such as through creating simple notebook "sessions" by abstracting the underlying complexity of the cloud. Providing low latency access to Spark clusters and sessions is a challenging problem due to the large overheads of cluster creation and session startup. In this paper, we introduce Intelligent Pooling, a system for proactively provisioning compute resources to combat the aforementioned overheads. To reduce the COGS (cost-of-goods-sold), our system (1) predicts usage patterns using an innovative hybrid Machine Learning (ML) model with low latency and high accuracy; and (2) optimizes the pool size dynamically to meet customer demand while reducing extraneous COGS. The proposed system auto-tunes its hyper-parameters to balance between performance and operational cost with minimal to no engineering input. Evaluated using large-scale production data, Intelligent Pooling achieves up to 43% reduction in cluster idle time compared to static pooling when targeting 99% pool hit rate. Currently deployed in production, Intelligent Pooling is on track to save tens of million dollars in COGS per year as compared to traditional pre-provisioned pools. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11326v1-abstract-full').style.display = 'none'; document.getElementById('2411.11326v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Proceedings of the VLDB Endowment, Vol. 17, No. 7 ISSN 2150-8097, 2024 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11325">arXiv:2411.11325</a> <span> [<a href="https://arxiv.org/pdf/2411.11325">pdf</a>, <a href="https://arxiv.org/format/2411.11325">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Databases">cs.DB</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3654952">10.1145/3654952 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Lorentz: Learned SKU Recommendation Using Profile Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Glaze%2C+N">Nicholas Glaze</a>, <a href="/search/cs?searchtype=author&query=McNeely%2C+T">Tria McNeely</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yiwen Zhu</a>, <a href="/search/cs?searchtype=author&query=Gleeson%2C+M">Matthew Gleeson</a>, <a href="/search/cs?searchtype=author&query=Serr%2C+H">Helen Serr</a>, <a href="/search/cs?searchtype=author&query=Bhopi%2C+R">Rajeev Bhopi</a>, <a href="/search/cs?searchtype=author&query=Krishnan%2C+S">Subru Krishnan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11325v1-abstract-short" style="display: inline;"> Cloud operators have expanded their service offerings, known as Stock Keeping Units (SKUs), to accommodate diverse demands, resulting in increased complexity for customers to select appropriate configurations. In a studied system, only 43% of the resource capacity was correctly chosen. Automated solutions addressing this issue often require enriched data, such as workload traces, which are unavail… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11325v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11325v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11325v1-abstract-full" style="display: none;"> Cloud operators have expanded their service offerings, known as Stock Keeping Units (SKUs), to accommodate diverse demands, resulting in increased complexity for customers to select appropriate configurations. In a studied system, only 43% of the resource capacity was correctly chosen. Automated solutions addressing this issue often require enriched data, such as workload traces, which are unavailable for new services. However, telemetry from existing users and customer satisfaction feedback provide valuable insights for understanding customer needs and improving provisioning recommendations. This paper introduces Lorentz, an intelligent SKU recommender for provisioning compute resources without relying on workload traces. Lorentz uses customer profile data to forecast resource capacities for new users by profiling existing ones. It also incorporates a continuous feedback loop to refine recommendations based on customer performance versus cost preferences inferred from satisfaction signals. Validated with production data from Azure PostgreSQL DB, Lorentz achieves over 60% slack reduction without increasing throttling compared to user selections and existing defaults. Evaluations with synthetic data demonstrate Lorentz's ability to iteratively learn user preferences with high accuracy. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11325v1-abstract-full').style.display = 'none'; document.getElementById('2411.11325v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Proc. ACM Manag. Data, Vol. 2, No. 3 (SIGMOD), Article 149. Publication date: June 2024 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11188">arXiv:2411.11188</a> <span> [<a href="https://arxiv.org/pdf/2411.11188">pdf</a>, <a href="https://arxiv.org/format/2411.11188">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> AMAGO-2: Breaking the Multi-Task Barrier in Meta-Reinforcement Learning with Transformers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Grigsby%2C+J">Jake Grigsby</a>, <a href="/search/cs?searchtype=author&query=Sasek%2C+J">Justin Sasek</a>, <a href="/search/cs?searchtype=author&query=Parajuli%2C+S">Samyak Parajuli</a>, <a href="/search/cs?searchtype=author&query=Adebi%2C+D">Daniel Adebi</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+A">Amy Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yuke Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11188v1-abstract-short" style="display: inline;"> Language models trained on diverse datasets unlock generalization by in-context learning. Reinforcement Learning (RL) policies can achieve a similar effect by meta-learning within the memory of a sequence model. However, meta-RL research primarily focuses on adapting to minor variations of a single task. It is difficult to scale towards more general behavior without confronting challenges in multi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11188v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11188v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11188v1-abstract-full" style="display: none;"> Language models trained on diverse datasets unlock generalization by in-context learning. Reinforcement Learning (RL) policies can achieve a similar effect by meta-learning within the memory of a sequence model. However, meta-RL research primarily focuses on adapting to minor variations of a single task. It is difficult to scale towards more general behavior without confronting challenges in multi-task optimization, and few solutions are compatible with meta-RL's goal of learning from large training sets of unlabeled tasks. To address this challenge, we revisit the idea that multi-task RL is bottlenecked by imbalanced training losses created by uneven return scales across different tasks. We build upon recent advancements in Transformer-based (in-context) meta-RL and evaluate a simple yet scalable solution where both an agent's actor and critic objectives are converted to classification terms that decouple optimization from the current scale of returns. Large-scale comparisons in Meta-World ML45, Multi-Game Procgen, Multi-Task POPGym, Multi-Game Atari, and BabyAI find that this design unlocks significant progress in online multi-task adaptation and memory problems without explicit task labels. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11188v1-abstract-full').style.display = 'none'; document.getElementById('2411.11188v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11166">arXiv:2411.11166</a> <span> [<a href="https://arxiv.org/pdf/2411.11166">pdf</a>, <a href="https://arxiv.org/format/2411.11166">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3649217.3653575">10.1145/3649217.3653575 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Early Adoption of Generative Artificial Intelligence in Computing Education: Emergent Student Use Cases and Perspectives in 2023 </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Smith%2C+C+E">C. Estelle Smith</a>, <a href="/search/cs?searchtype=author&query=Shiekh%2C+K">Kylee Shiekh</a>, <a href="/search/cs?searchtype=author&query=Cooreman%2C+H">Hayden Cooreman</a>, <a href="/search/cs?searchtype=author&query=Rahman%2C+S">Sharfi Rahman</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yifei Zhu</a>, <a href="/search/cs?searchtype=author&query=Siam%2C+M+K">Md Kamrul Siam</a>, <a href="/search/cs?searchtype=author&query=Ivanitskiy%2C+M">Michael Ivanitskiy</a>, <a href="/search/cs?searchtype=author&query=Ahmed%2C+A+M">Ahmed M. Ahmed</a>, <a href="/search/cs?searchtype=author&query=Hallinan%2C+M">Michael Hallinan</a>, <a href="/search/cs?searchtype=author&query=Grisak%2C+A">Alexander Grisak</a>, <a href="/search/cs?searchtype=author&query=Fierro%2C+G">Gabe Fierro</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11166v1-abstract-short" style="display: inline;"> Because of the rapid development and increasing public availability of Generative Artificial Intelligence (GenAI) models and tools, educational institutions and educators must immediately reckon with the impact of students using GenAI. There is limited prior research on computing students' use and perceptions of GenAI. In anticipation of future advances and evolutions of GenAI, we capture a snapsh… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11166v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11166v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11166v1-abstract-full" style="display: none;"> Because of the rapid development and increasing public availability of Generative Artificial Intelligence (GenAI) models and tools, educational institutions and educators must immediately reckon with the impact of students using GenAI. There is limited prior research on computing students' use and perceptions of GenAI. In anticipation of future advances and evolutions of GenAI, we capture a snapshot of student attitudes towards and uses of yet emerging GenAI, in a period of time before university policies had reacted to these technologies. We surveyed all computer science majors in a small engineering-focused R1 university in order to: (1) capture a baseline assessment of how GenAI has been immediately adopted by aspiring computer scientists; (2) describe computing students' GenAI-related needs and concerns for their education and careers; and (3) discuss GenAI influences on CS pedagogy, curriculum, culture, and policy. We present an exploratory qualitative analysis of this data and discuss the impact of our findings on the emerging conversation around GenAI and education. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11166v1-abstract-full').style.display = 'none'; document.getElementById('2411.11166v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11070">arXiv:2411.11070</a> <span> [<a href="https://arxiv.org/pdf/2411.11070">pdf</a>, <a href="https://arxiv.org/ps/2411.11070">ps</a>, <a href="https://arxiv.org/format/2411.11070">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Joint Precoding and AP Selection for Energy Efficient RIS-aided Cell-Free Massive MIMO Using Multi-agent Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shi%2C+E">Enyu Shi</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiayi Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Ziheng Liu</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yiyang Zhu</a>, <a href="/search/cs?searchtype=author&query=Yuen%2C+C">Chau Yuen</a>, <a href="/search/cs?searchtype=author&query=Ng%2C+D+W+K">Derrick Wing Kwan Ng</a>, <a href="/search/cs?searchtype=author&query=Di+Renzo%2C+M">Marco Di Renzo</a>, <a href="/search/cs?searchtype=author&query=Ai%2C+B">Bo Ai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11070v1-abstract-short" style="display: inline;"> Cell-free (CF) massive multiple-input multiple-output (mMIMO) and reconfigurable intelligent surface (RIS) are two advanced transceiver technologies for realizing future sixth-generation (6G) networks. In this paper, we investigate the joint precoding and access point (AP) selection for energy efficient RIS-aided CF mMIMO system. To address the associated computational complexity and communication… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11070v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11070v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11070v1-abstract-full" style="display: none;"> Cell-free (CF) massive multiple-input multiple-output (mMIMO) and reconfigurable intelligent surface (RIS) are two advanced transceiver technologies for realizing future sixth-generation (6G) networks. In this paper, we investigate the joint precoding and access point (AP) selection for energy efficient RIS-aided CF mMIMO system. To address the associated computational complexity and communication power consumption, we advocate for user-centric dynamic networks in which each user is served by a subset of APs rather than by all of them. Based on the user-centric network, we formulate a joint precoding and AP selection problem to maximize the energy efficiency (EE) of the considered system. To solve this complex nonconvex problem, we propose an innovative double-layer multi-agent reinforcement learning (MARL)-based scheme. Moreover, we propose an adaptive power threshold-based AP selection scheme to further enhance the EE of the considered system. To reduce the computational complexity of the RIS-aided CF mMIMO system, we introduce a fuzzy logic (FL) strategy into the MARL scheme to accelerate convergence. The simulation results show that the proposed FL-based MARL cooperative architecture effectively improves EE performance, offering a 85\% enhancement over the zero-forcing (ZF) method, and achieves faster convergence speed compared with MARL. It is important to note that increasing the transmission power of the APs or the number of RIS elements can effectively enhance the spectral efficiency (SE) performance, which also leads to an increase in power consumption, resulting in a non-trivial trade-off between the quality of service and EE performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11070v1-abstract-full').style.display = 'none'; document.getElementById('2411.11070v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10364">arXiv:2411.10364</a> <span> [<a href="https://arxiv.org/pdf/2411.10364">pdf</a>, <a href="https://arxiv.org/format/2411.10364">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Forming Auxiliary High-confident Instance-level Loss to Promote Learning from Label Proportions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ma%2C+T">Tianhao Ma</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Han Chen</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+J">Juncheng Hu</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yungang Zhu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Ximing Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10364v1-abstract-short" style="display: inline;"> Learning from label proportions (LLP), i.e., a challenging weakly-supervised learning task, aims to train a classifier by using bags of instances and the proportions of classes within bags, rather than annotated labels for each instance. Beyond the traditional bag-level loss, the mainstream methodology of LLP is to incorporate an auxiliary instance-level loss with pseudo-labels formed by predictio… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10364v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10364v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10364v1-abstract-full" style="display: none;"> Learning from label proportions (LLP), i.e., a challenging weakly-supervised learning task, aims to train a classifier by using bags of instances and the proportions of classes within bags, rather than annotated labels for each instance. Beyond the traditional bag-level loss, the mainstream methodology of LLP is to incorporate an auxiliary instance-level loss with pseudo-labels formed by predictions. Unfortunately, we empirically observed that the pseudo-labels are are often inaccurate due to over-smoothing, especially for the scenarios with large bag sizes, hurting the classifier induction. To alleviate this problem, we suggest a novel LLP method, namely Learning from Label Proportions with Auxiliary High-confident Instance-level Loss (L^2P-AHIL). Specifically, we propose a dual entropy-based weight (DEW) method to adaptively measure the confidences of pseudo-labels. It simultaneously emphasizes accurate predictions at the bag level and avoids overly smoothed predictions. We then form high-confident instance-level loss with DEW, and jointly optimize it with the bag-level loss in a self-training manner. The experimental results on benchmark datasets show that L^2P-AHIL can surpass the existing baseline methods, and the performance gain can be more significant as the bag size increases. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10364v1-abstract-full').style.display = 'none'; document.getElementById('2411.10364v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09154">arXiv:2411.09154</a> <span> [<a href="https://arxiv.org/pdf/2411.09154">pdf</a>, <a href="https://arxiv.org/ps/2411.09154">ps</a>, <a href="https://arxiv.org/format/2411.09154">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> STAR-RIS Enabled ISAC Systems: Joint Rate Splitting and Beamforming Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yuan Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Ruichen Zhang</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+R">Ruihong Jiang</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yongdong Zhu</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+H">Huimin Hu</a>, <a href="/search/cs?searchtype=author&query=Ni%2C+Q">Qiang Ni</a>, <a href="/search/cs?searchtype=author&query=Fei%2C+Z">Zesong Fei</a>, <a href="/search/cs?searchtype=author&query=Niyato%2C+D">Dusit Niyato</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09154v1-abstract-short" style="display: inline;"> This paper delves into an integrated sensing and communication (ISAC) system bolstered by a simultaneously transmitting and reflecting reconfigurable intelligent surface (STAR-RIS). Within this system, a base station (BS) is equipped with communication and radar capabilities, enabling it to communicate with ground terminals (GTs) and concurrently probe for echo signals from a target of interest. M… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09154v1-abstract-full').style.display = 'inline'; document.getElementById('2411.09154v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09154v1-abstract-full" style="display: none;"> This paper delves into an integrated sensing and communication (ISAC) system bolstered by a simultaneously transmitting and reflecting reconfigurable intelligent surface (STAR-RIS). Within this system, a base station (BS) is equipped with communication and radar capabilities, enabling it to communicate with ground terminals (GTs) and concurrently probe for echo signals from a target of interest. Moreover, to manage interference and improve communication quality, the rate splitting multiple access (RSMA) scheme is incorporated into the system. The signal-to-interference-plus-noise ratio (SINR) of the received sensing echo signals is a measure of sensing performance. We formulate a joint optimization problem of common rates, transmit beamforming at the BS, and passive beamforming vectors of the STAR-RIS. The objective is to maximize sensing SINR while guaranteeing the communication rate requirements for each GT. We present an iterative algorithm to address the non-convex problem by invoking Dinkelbach's transform, semidefinite relaxation (SDR), majorization-minimization, and sequential rank-one constraint relaxation (SROCR) theories. Simulation results manifest that the performance of the studied ISAC network enhanced by the STAR-RIS and RSMA surpasses other benchmarks considerably. The results evidently indicate the superior performance improvement of the ISAC system with the proposed RSMA-based transmission strategy design and the dynamic optimization of both transmission and reflection beamforming at STAR-RIS. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09154v1-abstract-full').style.display = 'none'; document.getElementById('2411.09154v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages, 9 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09153">arXiv:2411.09153</a> <span> [<a href="https://arxiv.org/pdf/2411.09153">pdf</a>, <a href="https://arxiv.org/format/2411.09153">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> VidMan: Exploiting Implicit Dynamics from Video Diffusion Model for Effective Robot Manipulation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wen%2C+Y">Youpeng Wen</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+J">Junfan Lin</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yi Zhu</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J">Jianhua Han</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+H">Hang Xu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+S">Shen Zhao</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+X">Xiaodan Liang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09153v1-abstract-short" style="display: inline;"> Recent advancements utilizing large-scale video data for learning video generation models demonstrate significant potential in understanding complex physical dynamics. It suggests the feasibility of leveraging diverse robot trajectory data to develop a unified, dynamics-aware model to enhance robot manipulation. However, given the relatively small amount of available robot data, directly fitting d… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09153v1-abstract-full').style.display = 'inline'; document.getElementById('2411.09153v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09153v1-abstract-full" style="display: none;"> Recent advancements utilizing large-scale video data for learning video generation models demonstrate significant potential in understanding complex physical dynamics. It suggests the feasibility of leveraging diverse robot trajectory data to develop a unified, dynamics-aware model to enhance robot manipulation. However, given the relatively small amount of available robot data, directly fitting data without considering the relationship between visual observations and actions could lead to suboptimal data utilization. To this end, we propose VidMan (Video Diffusion for Robot Manipulation), a novel framework that employs a two-stage training mechanism inspired by dual-process theory from neuroscience to enhance stability and improve data utilization efficiency. Specifically, in the first stage, VidMan is pre-trained on the Open X-Embodiment dataset (OXE) for predicting future visual trajectories in a video denoising diffusion manner, enabling the model to develop a long horizontal awareness of the environment's dynamics. In the second stage, a flexible yet effective layer-wise self-attention adapter is introduced to transform VidMan into an efficient inverse dynamics model that predicts action modulated by the implicit dynamics knowledge via parameter sharing. Our VidMan framework outperforms state-of-the-art baseline model GR-1 on the CALVIN benchmark, achieving a 11.7% relative improvement, and demonstrates over 9% precision gains on the OXE small-scale dataset. These results provide compelling evidence that world models can significantly enhance the precision of robot action prediction. Codes and models will be public. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09153v1-abstract-full').style.display = 'none'; document.getElementById('2411.09153v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.08599">arXiv:2411.08599</a> <span> [<a href="https://arxiv.org/pdf/2411.08599">pdf</a>, <a href="https://arxiv.org/format/2411.08599">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Databases">cs.DB</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> XiYan-SQL: A Multi-Generator Ensemble Framework for Text-to-SQL </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yingqi Gao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yifu Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiaoxia Li</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+X">Xiaorong Shi</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yin Zhu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yiming Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Shiqi Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wei Li</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+Y">Yuntao Hong</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+Z">Zhiling Luo</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+J">Jinyang Gao</a>, <a href="/search/cs?searchtype=author&query=Mou%2C+L">Liyu Mou</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yu Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.08599v1-abstract-short" style="display: inline;"> To tackle the challenges of large language model performance in natural language to SQL tasks, we introduce XiYan-SQL, an innovative framework that employs a multi-generator ensemble strategy to improve candidate generation. We introduce M-Schema, a semi-structured schema representation method designed to enhance the understanding of database structures. To enhance the quality and diversity of gen… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08599v1-abstract-full').style.display = 'inline'; document.getElementById('2411.08599v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.08599v1-abstract-full" style="display: none;"> To tackle the challenges of large language model performance in natural language to SQL tasks, we introduce XiYan-SQL, an innovative framework that employs a multi-generator ensemble strategy to improve candidate generation. We introduce M-Schema, a semi-structured schema representation method designed to enhance the understanding of database structures. To enhance the quality and diversity of generated candidate SQL queries, XiYan-SQL integrates the significant potential of in-context learning (ICL) with the precise control of supervised fine-tuning. On one hand, we propose a series of training strategies to fine-tune models to generate high-quality candidates with diverse preferences. On the other hand, we implement the ICL approach with an example selection method based on named entity recognition to prevent overemphasis on entities. The refiner optimizes each candidate by correcting logical or syntactical errors. To address the challenge of identifying the best candidate, we fine-tune a selection model to distinguish nuances of candidate SQL queries. The experimental results on multiple dialect datasets demonstrate the robustness of XiYan-SQL in addressing challenges across different scenarios. Overall, our proposed XiYan-SQL achieves the state-of-the-art execution accuracy of 89.65% on the Spider test set, 69.86% on SQL-Eval, 41.20% on NL2GQL, and a competitive score of 72.23% on the Bird development benchmark. The proposed framework not only enhances the quality and diversity of SQL queries but also outperforms previous methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08599v1-abstract-full').style.display = 'none'; document.getElementById('2411.08599v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2; H.2 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.05318">arXiv:2411.05318</a> <span> [<a href="https://arxiv.org/pdf/2411.05318">pdf</a>, <a href="https://arxiv.org/format/2411.05318">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Data Structures and Algorithms">cs.DS</span> </div> </div> <p class="title is-5 mathjax"> Fairness in Monotone $k$-submodular Maximization: Algorithms and Applications </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yanhui Zhu</a>, <a href="/search/cs?searchtype=author&query=Basu%2C+S">Samik Basu</a>, <a href="/search/cs?searchtype=author&query=Pavan%2C+A">A. Pavan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.05318v1-abstract-short" style="display: inline;"> Submodular optimization has become increasingly prominent in machine learning and fairness has drawn much attention. In this paper, we propose to study the fair $k$-submodular maximization problem and develop a $\frac{1}{3}$-approximation greedy algorithm with a running time of $\mathcal{O}(knB)$. To the best of our knowledge, our work is the first to incorporate fairness in the context of $k$-sub… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05318v1-abstract-full').style.display = 'inline'; document.getElementById('2411.05318v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.05318v1-abstract-full" style="display: none;"> Submodular optimization has become increasingly prominent in machine learning and fairness has drawn much attention. In this paper, we propose to study the fair $k$-submodular maximization problem and develop a $\frac{1}{3}$-approximation greedy algorithm with a running time of $\mathcal{O}(knB)$. To the best of our knowledge, our work is the first to incorporate fairness in the context of $k$-submodular maximization, and our theoretical guarantee matches the best-known $k$-submodular maximization results without fairness constraints. In addition, we have developed a faster threshold-based algorithm that achieves a $(\frac{1}{3} - 蔚)$ approximation with $\mathcal{O}(\frac{kn}蔚 \log \frac{B}蔚)$ evaluations of the function $f$. Furthermore, for both algorithms, we provide approximation guarantees when the $k$-submodular function is not accessible but only can be approximately accessed. We have extensively validated our theoretical findings through empirical research and examined the practical implications of fairness. Specifically, we have addressed the question: ``What is the price of fairness?" through case studies on influence maximization with $k$ topics and sensor placement with $k$ types. The experimental results show that the fairness constraints do not significantly undermine the quality of solutions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05318v1-abstract-full').style.display = 'none'; document.getElementById('2411.05318v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">17 pages. To appear in IEEE BigData 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.05223">arXiv:2411.05223</a> <span> [<a href="https://arxiv.org/pdf/2411.05223">pdf</a>, <a href="https://arxiv.org/format/2411.05223">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Generalizable Single-Source Cross-modality Medical Image Segmentation via Invariant Causal Mechanisms </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+B">Boqi Chen</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yuanzhi Zhu</a>, <a href="/search/cs?searchtype=author&query=Ao%2C+Y">Yunke Ao</a>, <a href="/search/cs?searchtype=author&query=Caprara%2C+S">Sebastiano Caprara</a>, <a href="/search/cs?searchtype=author&query=Sutter%2C+R">Reto Sutter</a>, <a href="/search/cs?searchtype=author&query=R%C3%A4tsch%2C+G">Gunnar R盲tsch</a>, <a href="/search/cs?searchtype=author&query=Konukoglu%2C+E">Ender Konukoglu</a>, <a href="/search/cs?searchtype=author&query=Susmelj%2C+A">Anna Susmelj</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.05223v1-abstract-short" style="display: inline;"> Single-source domain generalization (SDG) aims to learn a model from a single source domain that can generalize well on unseen target domains. This is an important task in computer vision, particularly relevant to medical imaging where domain shifts are common. In this work, we consider a challenging yet practical setting: SDG for cross-modality medical image segmentation. We combine causality-ins… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05223v1-abstract-full').style.display = 'inline'; document.getElementById('2411.05223v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.05223v1-abstract-full" style="display: none;"> Single-source domain generalization (SDG) aims to learn a model from a single source domain that can generalize well on unseen target domains. This is an important task in computer vision, particularly relevant to medical imaging where domain shifts are common. In this work, we consider a challenging yet practical setting: SDG for cross-modality medical image segmentation. We combine causality-inspired theoretical insights on learning domain-invariant representations with recent advancements in diffusion-based augmentation to improve generalization across diverse imaging modalities. Guided by the ``intervention-augmentation equivariant'' principle, we use controlled diffusion models (DMs) to simulate diverse imaging styles while preserving the content, leveraging rich generative priors in large-scale pretrained DMs to comprehensively perturb the multidimensional style variable. Extensive experiments on challenging cross-modality segmentation tasks demonstrate that our approach consistently outperforms state-of-the-art SDG methods across three distinct anatomies and imaging modalities. The source code is available at \href{https://github.com/ratschlab/ICMSeg}{https://github.com/ratschlab/ICMSeg}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05223v1-abstract-full').style.display = 'none'; document.getElementById('2411.05223v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">WACV 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.04826">arXiv:2411.04826</a> <span> [<a href="https://arxiv.org/pdf/2411.04826">pdf</a>, <a href="https://arxiv.org/format/2411.04826">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> D$^3$epth: Self-Supervised Depth Estimation with Dynamic Mask in Dynamic Scenes </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+S">Siyu Chen</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Hong Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wenhao Li</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Ying Zhu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Guoquan Wang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+J">Jianbing Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.04826v1-abstract-short" style="display: inline;"> Depth estimation is a crucial technology in robotics. Recently, self-supervised depth estimation methods have demonstrated great potential as they can efficiently leverage large amounts of unlabelled real-world data. However, most existing methods are designed under the assumption of static scenes, which hinders their adaptability in dynamic environments. To address this issue, we present D$^3$ept… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04826v1-abstract-full').style.display = 'inline'; document.getElementById('2411.04826v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.04826v1-abstract-full" style="display: none;"> Depth estimation is a crucial technology in robotics. Recently, self-supervised depth estimation methods have demonstrated great potential as they can efficiently leverage large amounts of unlabelled real-world data. However, most existing methods are designed under the assumption of static scenes, which hinders their adaptability in dynamic environments. To address this issue, we present D$^3$epth, a novel method for self-supervised depth estimation in dynamic scenes. It tackles the challenge of dynamic objects from two key perspectives. First, within the self-supervised framework, we design a reprojection constraint to identify regions likely to contain dynamic objects, allowing the construction of a dynamic mask that mitigates their impact at the loss level. Second, for multi-frame depth estimation, we introduce a cost volume auto-masking strategy that leverages adjacent frames to identify regions associated with dynamic objects and generate corresponding masks. This provides guidance for subsequent processes. Furthermore, we propose a spectral entropy uncertainty module that incorporates spectral entropy to guide uncertainty estimation during depth fusion, effectively addressing issues arising from cost volume computation in dynamic environments. Extensive experiments on KITTI and Cityscapes datasets demonstrate that the proposed method consistently outperforms existing self-supervised monocular depth estimation baselines. Code is available at \url{https://github.com/Csyunling/D3epth}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04826v1-abstract-full').style.display = 'none'; document.getElementById('2411.04826v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Open sourced</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.04158">arXiv:2411.04158</a> <span> [<a href="https://arxiv.org/pdf/2411.04158">pdf</a>, <a href="https://arxiv.org/format/2411.04158">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.21437/Interspeech.2024-2288">10.21437/Interspeech.2024-2288 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Analyzing Multimodal Features of Spontaneous Voice Assistant Commands for Mild Cognitive Impairment Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lin%2C+N">Nana Lin</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Youxiang Zhu</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+X">Xiaohui Liang</a>, <a href="/search/cs?searchtype=author&query=Batsis%2C+J+A">John A. Batsis</a>, <a href="/search/cs?searchtype=author&query=Summerour%2C+C">Caroline Summerour</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.04158v1-abstract-short" style="display: inline;"> Mild cognitive impairment (MCI) is a major public health concern due to its high risk of progressing to dementia. This study investigates the potential of detecting MCI with spontaneous voice assistant (VA) commands from 35 older adults in a controlled setting. Specifically, a command-generation task is designed with pre-defined intents for participants to freely generate commands that are more as… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04158v1-abstract-full').style.display = 'inline'; document.getElementById('2411.04158v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.04158v1-abstract-full" style="display: none;"> Mild cognitive impairment (MCI) is a major public health concern due to its high risk of progressing to dementia. This study investigates the potential of detecting MCI with spontaneous voice assistant (VA) commands from 35 older adults in a controlled setting. Specifically, a command-generation task is designed with pre-defined intents for participants to freely generate commands that are more associated with cognitive ability than read commands. We develop MCI classification and regression models with audio, textual, intent, and multimodal fusion features. We find the command-generation task outperforms the command-reading task with an average classification accuracy of 82%, achieved by leveraging multimodal fusion features. In addition, generated commands correlate more strongly with memory and attention subdomains than read commands. Our results confirm the effectiveness of the command-generation task and imply the promise of using longitudinal in-home commands for MCI detection. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04158v1-abstract-full').style.display = 'none'; document.getElementById('2411.04158v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.03859">arXiv:2411.03859</a> <span> [<a href="https://arxiv.org/pdf/2411.03859">pdf</a>, <a href="https://arxiv.org/format/2411.03859">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Emerging Technologies">cs.ET</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Physics and Society">physics.soc-ph</span> </div> </div> <p class="title is-5 mathjax"> UniTraj: Learning a Universal Trajectory Foundation Model from Billion-Scale Worldwide Traces </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yuanshao Zhu</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+J+J">James Jianqiao Yu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+X">Xiangyu Zhao</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+X">Xuetao Wei</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+Y">Yuxuan Liang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.03859v2-abstract-short" style="display: inline;"> Human trajectory modeling is essential for deciphering movement patterns and supporting advanced applications across various domains. However, existing methods are often tailored to specific tasks and regions, resulting in limitations related to task specificity, regional dependency, and data quality sensitivity. Addressing these challenges requires a universal human trajectory foundation model ca… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03859v2-abstract-full').style.display = 'inline'; document.getElementById('2411.03859v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.03859v2-abstract-full" style="display: none;"> Human trajectory modeling is essential for deciphering movement patterns and supporting advanced applications across various domains. However, existing methods are often tailored to specific tasks and regions, resulting in limitations related to task specificity, regional dependency, and data quality sensitivity. Addressing these challenges requires a universal human trajectory foundation model capable of generalizing and scaling across diverse tasks and geographic contexts. To this end, we propose UniTraj, a Universal human Trajectory foundation model that is task-adaptive, region-independent, and highly generalizable. To further enhance performance, we construct WorldTrace, the first large-scale, high-quality, globally distributed dataset sourced from open web platforms, encompassing 2.45 million trajectories with billions of points across 70 countries. Through multiple resampling and masking strategies designed for pre-training, UniTraj effectively overcomes geographic and task constraints, adapting to heterogeneous data quality. Extensive experiments across multiple trajectory analysis tasks and real-world datasets demonstrate that UniTraj consistently outperforms existing approaches in terms of scalability and adaptability. These results underscore the potential of UniTraj as a versatile, robust solution for a wide range of trajectory analysis applications, with WorldTrace serving as an ideal but non-exclusive foundation for training. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03859v2-abstract-full').style.display = 'none'; document.getElementById('2411.03859v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.03817">arXiv:2411.03817</a> <span> [<a href="https://arxiv.org/pdf/2411.03817">pdf</a>, <a href="https://arxiv.org/format/2411.03817">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> From Novice to Expert: LLM Agent Policy Optimization via Step-wise Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Deng%2C+Z">Zhirui Deng</a>, <a href="/search/cs?searchtype=author&query=Dou%2C+Z">Zhicheng Dou</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yutao Zhu</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+J">Ji-Rong Wen</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+R">Ruibin Xiong</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+M">Mang Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+W">Weipeng Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.03817v2-abstract-short" style="display: inline;"> The outstanding capabilities of large language models (LLMs) render them a crucial component in various autonomous agent systems. While traditional methods depend on the inherent knowledge of LLMs without fine-tuning, more recent approaches have shifted toward the reinforcement learning strategy to further enhance agents' ability to solve complex interactive tasks with environments and tools. Howe… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03817v2-abstract-full').style.display = 'inline'; document.getElementById('2411.03817v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.03817v2-abstract-full" style="display: none;"> The outstanding capabilities of large language models (LLMs) render them a crucial component in various autonomous agent systems. While traditional methods depend on the inherent knowledge of LLMs without fine-tuning, more recent approaches have shifted toward the reinforcement learning strategy to further enhance agents' ability to solve complex interactive tasks with environments and tools. However, previous approaches are constrained by the sparse reward issue, where existing datasets solely provide a final scalar reward for each multi-step reasoning chain, potentially leading to ineffectiveness and inefficiency in policy learning. In this paper, we introduce StepAgent, which utilizes step-wise reward to optimize the agent's reinforcement learning process. Inheriting the spirit of novice-to-expert theory, we first compare the actions of the expert and the agent to automatically generate intermediate rewards for fine-grained optimization. Additionally, we propose implicit-reward and inverse reinforcement learning techniques to facilitate agent reflection and policy adjustment. Further theoretical analysis demonstrates that the action distribution of the agent can converge toward the expert action distribution over multiple training cycles. Experimental results across various datasets indicate that StepAgent outperforms existing baseline methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03817v2-abstract-full').style.display = 'none'; document.getElementById('2411.03817v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.03682">arXiv:2411.03682</a> <span> [<a href="https://arxiv.org/pdf/2411.03682">pdf</a>, <a href="https://arxiv.org/format/2411.03682">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> LEGATO: Cross-Embodiment Imitation Using a Grasping Tool </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Seo%2C+M">Mingyo Seo</a>, <a href="/search/cs?searchtype=author&query=Park%2C+H+A">H. Andy Park</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+S">Shenli Yuan</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yuke Zhu</a>, <a href="/search/cs?searchtype=author&query=Sentis%2C+L">Luis Sentis</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.03682v1-abstract-short" style="display: inline;"> Cross-embodiment imitation learning enables policies trained on specific embodiments to transfer across different robots, unlocking the potential for large-scale imitation learning that is both cost-effective and highly reusable. This paper presents LEGATO, a cross-embodiment imitation learning framework for visuomotor skill transfer across varied kinematic morphologies. We introduce a handheld gr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03682v1-abstract-full').style.display = 'inline'; document.getElementById('2411.03682v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.03682v1-abstract-full" style="display: none;"> Cross-embodiment imitation learning enables policies trained on specific embodiments to transfer across different robots, unlocking the potential for large-scale imitation learning that is both cost-effective and highly reusable. This paper presents LEGATO, a cross-embodiment imitation learning framework for visuomotor skill transfer across varied kinematic morphologies. We introduce a handheld gripper that unifies action and observation spaces, allowing tasks to be defined consistently across robots. Using this gripper, we train visuomotor policies via imitation learning, applying a motion-invariant transformation to compute the training loss. Gripper motions are then retargeted into high-degree-of-freedom whole-body motions using inverse kinematics for deployment across diverse embodiments. Our evaluations in simulation and real-robot experiments highlight the framework's effectiveness in learning and transferring visuomotor skills across various robots. More information can be found at the project page: https://ut-hcrl.github.io/LEGATO. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03682v1-abstract-full').style.display = 'none'; document.getElementById('2411.03682v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to RA-L</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.03677">arXiv:2411.03677</a> <span> [<a href="https://arxiv.org/pdf/2411.03677">pdf</a>, <a href="https://arxiv.org/format/2411.03677">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Physical Layer Deception in OFDM Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+W">Wenwen Chen</a>, <a href="/search/cs?searchtype=author&query=Han%2C+B">Bin Han</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yao Zhu</a>, <a href="/search/cs?searchtype=author&query=Schmeink%2C+A">Anke Schmeink</a>, <a href="/search/cs?searchtype=author&query=Schotten%2C+H+D">Hans D. Schotten</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.03677v1-abstract-short" style="display: inline;"> As a promising technology, physical layer security (PLS) enhances security by leveraging the physical characteristics of communication channels. However, the conventional PLS approach leads to a considerable disparity in the effort legitimate users need to secure data compared to eavesdroppers. To address this issue, we propose a physical layer deception (PLD) framework, which applies random decep… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03677v1-abstract-full').style.display = 'inline'; document.getElementById('2411.03677v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.03677v1-abstract-full" style="display: none;"> As a promising technology, physical layer security (PLS) enhances security by leveraging the physical characteristics of communication channels. However, the conventional PLS approach leads to a considerable disparity in the effort legitimate users need to secure data compared to eavesdroppers. To address this issue, we propose a physical layer deception (PLD) framework, which applies random deceptive ciphering and orthogonal frequency-division multiplexing (OFDM) to defend against eavesdropping proactively. While ensuring the same level of confidentiality as traditional PLS methods, the PLD approach additionally introduces a deception mechanism, even when the eavesdropper possesses the same knowledge about the transmitter end as the legitimate receiver. Through thorough theoretical analyses and numerical simulations, we prove the superiority of our method over the conventional PLS approach. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03677v1-abstract-full').style.display = 'none'; document.getElementById('2411.03677v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to ICC 2025 (appendices are excluded from the submitted version due to length limit)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.03665">arXiv:2411.03665</a> <span> [<a href="https://arxiv.org/pdf/2411.03665">pdf</a>, <a href="https://arxiv.org/format/2411.03665">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Evaluating Moral Beliefs across LLMs through a Pluralistic Framework </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xuelin Liu</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yanfei Zhu</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+S">Shucheng Zhu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+P">Pengyuan Liu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Ying Liu</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+D">Dong Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.03665v1-abstract-short" style="display: inline;"> Proper moral beliefs are fundamental for language models, yet assessing these beliefs poses a significant challenge. This study introduces a novel three-module framework to evaluate the moral beliefs of four prominent large language models. Initially, we constructed a dataset containing 472 moral choice scenarios in Chinese, derived from moral words. The decision-making process of the models in th… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03665v1-abstract-full').style.display = 'inline'; document.getElementById('2411.03665v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.03665v1-abstract-full" style="display: none;"> Proper moral beliefs are fundamental for language models, yet assessing these beliefs poses a significant challenge. This study introduces a novel three-module framework to evaluate the moral beliefs of four prominent large language models. Initially, we constructed a dataset containing 472 moral choice scenarios in Chinese, derived from moral words. The decision-making process of the models in these scenarios reveals their moral principle preferences. By ranking these moral choices, we discern the varying moral beliefs held by different language models. Additionally, through moral debates, we investigate the firmness of these models to their moral choices. Our findings indicate that English language models, namely ChatGPT and Gemini, closely mirror moral decisions of the sample of Chinese university students, demonstrating strong adherence to their choices and a preference for individualistic moral beliefs. In contrast, Chinese models such as Ernie and ChatGLM lean towards collectivist moral beliefs, exhibiting ambiguity in their moral choices and debates. This study also uncovers gender bias embedded within the moral beliefs of all examined language models. Our methodology offers an innovative means to assess moral beliefs in both artificial and human intelligence, facilitating a comparison of moral values across different cultures. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03665v1-abstract-full').style.display = 'none'; document.getElementById('2411.03665v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.03603">arXiv:2411.03603</a> <span> [<a href="https://arxiv.org/pdf/2411.03603">pdf</a>, <a href="https://arxiv.org/format/2411.03603">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> </div> </div> <p class="title is-5 mathjax"> CPEG: Leveraging Consistency Policy with Consensus Guidance for Multi-agent Exploration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Fu%2C+Y">Yuqian Fu</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yuanheng Zhu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Haoran Li</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Z">Zijie Zhao</a>, <a href="/search/cs?searchtype=author&query=Chai%2C+J">Jiajun Chai</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+D">Dongbin Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.03603v1-abstract-short" style="display: inline;"> Efficient exploration is crucial in cooperative multi-agent reinforcement learning (MARL), especially in sparse-reward settings. However, due to the reliance on the unimodal policy, existing methods are prone to falling into the local optima, hindering the effective exploration of better policies. Furthermore, tackling multi-agent tasks in complex environments requires cooperation during explorati… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03603v1-abstract-full').style.display = 'inline'; document.getElementById('2411.03603v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.03603v1-abstract-full" style="display: none;"> Efficient exploration is crucial in cooperative multi-agent reinforcement learning (MARL), especially in sparse-reward settings. However, due to the reliance on the unimodal policy, existing methods are prone to falling into the local optima, hindering the effective exploration of better policies. Furthermore, tackling multi-agent tasks in complex environments requires cooperation during exploration, posing substantial challenges for MARL methods. To address these issues, we propose a Consistency Policy with consEnsus Guidance (CPEG), with two primary components: (a) introducing a multimodal policy to enhance exploration capabilities, and (b) sharing the consensus among agents to foster agent cooperation. For component (a), CPEG incorporates a Consistency model as the policy, leveraging its multimodal nature and stochastic characteristics to facilitate exploration. Regarding component (b), CPEG introduces a Consensus Learner to deduce the consensus on the global state from local observations. This consensus then serves as a guidance for the Consistency Policy, promoting cooperation among agents. The proposed method is evaluated in multi-agent particle environments (MPE) and multi-agent MuJoCo (MAMuJoCo), and empirical results indicate that CPEG not only achieves improvements in sparse-reward settings but also matches the performance of baselines in dense-reward environments. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03603v1-abstract-full').style.display = 'none'; document.getElementById('2411.03603v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.03531">arXiv:2411.03531</a> <span> [<a href="https://arxiv.org/pdf/2411.03531">pdf</a>, <a href="https://arxiv.org/format/2411.03531">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3627673.3680011">10.1145/3627673.3680011 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Personalized Video Summarization by Multimodal Video Understanding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+B">Brian Chen</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+X">Xiangyuan Zhao</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yingnan Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.03531v1-abstract-short" style="display: inline;"> Video summarization techniques have been proven to improve the overall user experience when it comes to accessing and comprehending video content. If the user's preference is known, video summarization can identify significant information or relevant content from an input video, aiding them in obtaining the necessary information or determining their interest in watching the original video. Adaptin… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03531v1-abstract-full').style.display = 'inline'; document.getElementById('2411.03531v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.03531v1-abstract-full" style="display: none;"> Video summarization techniques have been proven to improve the overall user experience when it comes to accessing and comprehending video content. If the user's preference is known, video summarization can identify significant information or relevant content from an input video, aiding them in obtaining the necessary information or determining their interest in watching the original video. Adapting video summarization to various types of video and user preferences requires significant training data and expensive human labeling. To facilitate such research, we proposed a new benchmark for video summarization that captures various user preferences. Also, we present a pipeline called Video Summarization with Language (VSL) for user-preferred video summarization that is based on pre-trained visual language models (VLMs) to avoid the need to train a video summarization system on a large training dataset. The pipeline takes both video and closed captioning as input and performs semantic analysis at the scene level by converting video frames into text. Subsequently, the user's genre preference was used as the basis for selecting the pertinent textual scenes. The experimental results demonstrate that our proposed pipeline outperforms current state-of-the-art unsupervised video summarization models. We show that our method is more adaptable across different datasets compared to supervised query-based video summarization models. In the end, the runtime analysis demonstrates that our pipeline is more suitable for practical use when scaling up the number of user preferences and videos. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03531v1-abstract-full').style.display = 'none'; document.getElementById('2411.03531v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">In Proceedings of CIKM 2024 Applied Research Track</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> 33rd ACM International Conference on Information and Knowledge Management (CIKM 2024) </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.03129">arXiv:2411.03129</a> <span> [<a href="https://arxiv.org/pdf/2411.03129">pdf</a>, <a href="https://arxiv.org/format/2411.03129">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Biological Physics">physics.bio-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MA^2: A Self-Supervised and Motion Augmenting Autoencoder for Gait-Based Automatic Disease Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yiqun Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Ke Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yin Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.03129v1-abstract-short" style="display: inline;"> Ground reaction force (GRF) is the force exerted by the ground on a body in contact with it. GRF-based automatic disease detection (ADD) has become an emerging medical diagnosis method, which aims to learn and identify disease patterns corresponding to different gait pressures based on deep learning methods. Although existing ADD methods can save doctors time in making diagnoses, training deep mod… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03129v1-abstract-full').style.display = 'inline'; document.getElementById('2411.03129v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.03129v1-abstract-full" style="display: none;"> Ground reaction force (GRF) is the force exerted by the ground on a body in contact with it. GRF-based automatic disease detection (ADD) has become an emerging medical diagnosis method, which aims to learn and identify disease patterns corresponding to different gait pressures based on deep learning methods. Although existing ADD methods can save doctors time in making diagnoses, training deep models still struggles with the cost caused by the labeling engineering for a large number of gait diagnostic data for subjects. On the other hand, the accuracy of the deep model under the unified benchmark GRF dataset and the generalization ability on scalable gait datasets need to be further improved. To address these issues, we propose MA2, a GRF-based self-supervised and motion augmenting auto-encoder, which models the ADD task as an encoder-decoder paradigm. In the encoder, we introduce an embedding block including the 3-layer 1D convolution for extracting the token and a mask generator to randomly mask out the sequence of tokens to maximize the model's potential to capture high-level, discriminative, intrinsic representations. whereafter, the decoder utilizes this information to reconstruct the pixel sequence of the origin input and calculate the reconstruction loss to optimize the network. Moreover, the backbone of an auto-encoder is multi-head self-attention that can consider the global information of the token from the input, not just the local neighborhood. This allows the model to capture generalized contextual information. Extensive experiments demonstrate MA2 has SOTA performance of 90.91% accuracy on 1% limited pathological GRF samples with labels, and good generalization ability of 78.57% accuracy on scalable Parkinson disease dataset. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03129v1-abstract-full').style.display = 'none'; document.getElementById('2411.03129v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 11 figures, article</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02704">arXiv:2411.02704</a> <span> [<a href="https://arxiv.org/pdf/2411.02704">pdf</a>, <a href="https://arxiv.org/format/2411.02704">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> RT-Affordance: Affordances are Versatile Intermediate Representations for Robot Manipulation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Nasiriany%2C+S">Soroush Nasiriany</a>, <a href="/search/cs?searchtype=author&query=Kirmani%2C+S">Sean Kirmani</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+T">Tianli Ding</a>, <a href="/search/cs?searchtype=author&query=Smith%2C+L">Laura Smith</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yuke Zhu</a>, <a href="/search/cs?searchtype=author&query=Driess%2C+D">Danny Driess</a>, <a href="/search/cs?searchtype=author&query=Sadigh%2C+D">Dorsa Sadigh</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+T">Ted Xiao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02704v1-abstract-short" style="display: inline;"> We explore how intermediate policy representations can facilitate generalization by providing guidance on how to perform manipulation tasks. Existing representations such as language, goal images, and trajectory sketches have been shown to be helpful, but these representations either do not provide enough context or provide over-specified context that yields less robust policies. We propose condit… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02704v1-abstract-full').style.display = 'inline'; document.getElementById('2411.02704v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02704v1-abstract-full" style="display: none;"> We explore how intermediate policy representations can facilitate generalization by providing guidance on how to perform manipulation tasks. Existing representations such as language, goal images, and trajectory sketches have been shown to be helpful, but these representations either do not provide enough context or provide over-specified context that yields less robust policies. We propose conditioning policies on affordances, which capture the pose of the robot at key stages of the task. Affordances offer expressive yet lightweight abstractions, are easy for users to specify, and facilitate efficient learning by transferring knowledge from large internet datasets. Our method, RT-Affordance, is a hierarchical model that first proposes an affordance plan given the task language, and then conditions the policy on this affordance plan to perform manipulation. Our model can flexibly bridge heterogeneous sources of supervision including large web datasets and robot trajectories. We additionally train our model on cheap-to-collect in-domain affordance images, allowing us to learn new tasks without collecting any additional costly robot trajectories. We show on a diverse set of novel tasks how RT-Affordance exceeds the performance of existing methods by over 50%, and we empirically demonstrate that affordances are robust to novel settings. Videos available at https://snasiriany.me/rt-affordance <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02704v1-abstract-full').style.display = 'none'; document.getElementById('2411.02704v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02120">arXiv:2411.02120</a> <span> [<a href="https://arxiv.org/pdf/2411.02120">pdf</a>, <a href="https://arxiv.org/format/2411.02120">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Biomolecules">q-bio.BM</span> </div> </div> <p class="title is-5 mathjax"> Bridge-IF: Learning Inverse Protein Folding with Markov Bridges </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yiheng Zhu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+J">Jialu Wu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Q">Qiuyi Li</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+J">Jiahuan Yan</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+M">Mingze Yin</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+W">Wei Wu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+M">Mingyang Li</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+J">Jieping Ye</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zheng Wang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+J">Jian Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02120v1-abstract-short" style="display: inline;"> Inverse protein folding is a fundamental task in computational protein design, which aims to design protein sequences that fold into the desired backbone structures. While the development of machine learning algorithms for this task has seen significant success, the prevailing approaches, which predominantly employ a discriminative formulation, frequently encounter the error accumulation issue and… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02120v1-abstract-full').style.display = 'inline'; document.getElementById('2411.02120v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02120v1-abstract-full" style="display: none;"> Inverse protein folding is a fundamental task in computational protein design, which aims to design protein sequences that fold into the desired backbone structures. While the development of machine learning algorithms for this task has seen significant success, the prevailing approaches, which predominantly employ a discriminative formulation, frequently encounter the error accumulation issue and often fail to capture the extensive variety of plausible sequences. To fill these gaps, we propose Bridge-IF, a generative diffusion bridge model for inverse folding, which is designed to learn the probabilistic dependency between the distributions of backbone structures and protein sequences. Specifically, we harness an expressive structure encoder to propose a discrete, informative prior derived from structures, and establish a Markov bridge to connect this prior with native sequences. During the inference stage, Bridge-IF progressively refines the prior sequence, culminating in a more plausible design. Moreover, we introduce a reparameterization perspective on Markov bridge models, from which we derive a simplified loss function that facilitates more effective training. We also modulate protein language models (PLMs) with structural conditions to precisely approximate the Markov bridge process, thereby significantly enhancing generation performance while maintaining parameter-efficient training. Extensive experiments on well-established benchmarks demonstrate that Bridge-IF predominantly surpasses existing baselines in sequence recovery and excels in the design of plausible proteins with high foldability. The code is available at https://github.com/violet-sto/Bridge-IF. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02120v1-abstract-full').style.display = 'none'; document.getElementById('2411.02120v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02038">arXiv:2411.02038</a> <span> [<a href="https://arxiv.org/pdf/2411.02038">pdf</a>, <a href="https://arxiv.org/format/2411.02038">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Addressing Representation Collapse in Vector Quantized Models with One Linear Layer </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yongxin Zhu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+B">Bocheng Li</a>, <a href="/search/cs?searchtype=author&query=Xin%2C+Y">Yifei Xin</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+L">Linli Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02038v1-abstract-short" style="display: inline;"> Vector Quantization (VQ) is a widely used method for converting continuous representations into discrete codes, which has become fundamental in unsupervised representation learning and latent generative models. However, VQ models are often hindered by the problem of representation collapse in the latent space, which leads to low codebook utilization and limits the scalability of the codebook for l… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02038v1-abstract-full').style.display = 'inline'; document.getElementById('2411.02038v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02038v1-abstract-full" style="display: none;"> Vector Quantization (VQ) is a widely used method for converting continuous representations into discrete codes, which has become fundamental in unsupervised representation learning and latent generative models. However, VQ models are often hindered by the problem of representation collapse in the latent space, which leads to low codebook utilization and limits the scalability of the codebook for large-scale training. Existing methods designed to mitigate representation collapse typically reduce the dimensionality of latent space at the expense of model capacity, which do not fully resolve the core issue. In this study, we conduct a theoretical analysis of representation collapse in VQ models and identify its primary cause as the disjoint optimization of the codebook, where only a small subset of code vectors are updated through gradient descent. To address this issue, we propose \textbf{SimVQ}, a novel method which reparameterizes the code vectors through a linear transformation layer based on a learnable latent basis. This transformation optimizes the \textit{entire linear space} spanned by the codebook, rather than merely updating \textit{the code vector} selected by the nearest-neighbor search in vanilla VQ models. Although it is commonly understood that the multiplication of two linear matrices is equivalent to applying a single linear layer, our approach works surprisingly well in resolving the collapse issue in VQ models with just one linear layer. We validate the efficacy of SimVQ through extensive experiments across various modalities, including image and audio data with different model architectures. Our code is available at \url{https://github.com/youngsheen/SimVQ}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02038v1-abstract-full').style.display = 'none'; document.getElementById('2411.02038v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01169">arXiv:2411.01169</a> <span> [<a href="https://arxiv.org/pdf/2411.01169">pdf</a>, <a href="https://arxiv.org/format/2411.01169">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/TKDE.2024.3397683">10.1109/TKDE.2024.3397683 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Bi-Level Graph Structure Learning for Next POI Recommendation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+L">Liang Wang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+S">Shu Wu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Q">Qiang Liu</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yanqiao Zhu</a>, <a href="/search/cs?searchtype=author&query=Tao%2C+X">Xiang Tao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Mengdi Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L">Liang Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01169v1-abstract-short" style="display: inline;"> Next point-of-interest (POI) recommendation aims to predict a user's next destination based on sequential check-in history and a set of POI candidates. Graph neural networks (GNNs) have demonstrated a remarkable capability in this endeavor by exploiting the extensive global collaborative signals present among POIs. However, most of the existing graph-based approaches construct graph structures bas… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01169v1-abstract-full').style.display = 'inline'; document.getElementById('2411.01169v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01169v1-abstract-full" style="display: none;"> Next point-of-interest (POI) recommendation aims to predict a user's next destination based on sequential check-in history and a set of POI candidates. Graph neural networks (GNNs) have demonstrated a remarkable capability in this endeavor by exploiting the extensive global collaborative signals present among POIs. However, most of the existing graph-based approaches construct graph structures based on pre-defined heuristics, failing to consider inherent hierarchical structures of POI features such as geographical locations and visiting peaks, or suffering from noisy and incomplete structures in graphs. To address the aforementioned issues, this paper presents a novel Bi-level Graph Structure Learning (BiGSL) for next POI recommendation. BiGSL first learns a hierarchical graph structure to capture the fine-to-coarse connectivity between POIs and prototypes, and then uses a pairwise learning module to dynamically infer relationships between POI pairs and prototype pairs. Based on the learned bi-level graphs, our model then employs a multi-relational graph network that considers both POI- and prototype-level neighbors, resulting in improved POI representations. Our bi-level structure learning scheme is more robust to data noise and incompleteness, and improves the exploration ability for recommendation by alleviating sparsity issues. Experimental results on three real-world datasets demonstrate the superiority of our model over existing state-of-the-art methods, with a significant improvement in recommendation accuracy and exploration performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01169v1-abstract-full').style.display = 'none'; document.getElementById('2411.01169v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by IEEE Transactions on Knowledge and Data Engineering</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> IEEE Transactions on Knowledge and Data Engineering, vol. 36, no. 11, pp. 5695-5708, Nov. 2024 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00965">arXiv:2411.00965</a> <span> [<a href="https://arxiv.org/pdf/2411.00965">pdf</a>, <a href="https://arxiv.org/format/2411.00965">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> SPOT: SE(3) Pose Trajectory Diffusion for Object-Centric Manipulation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hsu%2C+C">Cheng-Chun Hsu</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+B">Bowen Wen</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jie Xu</a>, <a href="/search/cs?searchtype=author&query=Narang%2C+Y">Yashraj Narang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiaolong Wang</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yuke Zhu</a>, <a href="/search/cs?searchtype=author&query=Biswas%2C+J">Joydeep Biswas</a>, <a href="/search/cs?searchtype=author&query=Birchfield%2C+S">Stan Birchfield</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00965v1-abstract-short" style="display: inline;"> We introduce SPOT, an object-centric imitation learning framework. The key idea is to capture each task by an object-centric representation, specifically the SE(3) object pose trajectory relative to the target. This approach decouples embodiment actions from sensory inputs, facilitating learning from various demonstration types, including both action-based and action-less human hand demonstrations… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00965v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00965v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00965v1-abstract-full" style="display: none;"> We introduce SPOT, an object-centric imitation learning framework. The key idea is to capture each task by an object-centric representation, specifically the SE(3) object pose trajectory relative to the target. This approach decouples embodiment actions from sensory inputs, facilitating learning from various demonstration types, including both action-based and action-less human hand demonstrations, as well as cross-embodiment generalization. Additionally, object pose trajectories inherently capture planning constraints from demonstrations without the need for manually crafted rules. To guide the robot in executing the task, the object trajectory is used to condition a diffusion policy. We show improvement compared to prior work on RLBench simulated tasks. In real-world evaluation, using only eight demonstrations shot on an iPhone, our approach completed all tasks while fully complying with task constraints. Project page: https://nvlabs.github.io/object_centric_diffusion <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00965v1-abstract-full').style.display = 'none'; document.getElementById('2411.00965v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00491">arXiv:2411.00491</a> <span> [<a href="https://arxiv.org/pdf/2411.00491">pdf</a>, <a href="https://arxiv.org/format/2411.00491">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> GDTB: Genre Diverse Data for English Shallow Discourse Parsing across Modalities, Text Types, and Domains </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+Y+J">Yang Janet Liu</a>, <a href="/search/cs?searchtype=author&query=Aoyama%2C+T">Tatsuya Aoyama</a>, <a href="/search/cs?searchtype=author&query=Scivetti%2C+W">Wesley Scivetti</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yilun Zhu</a>, <a href="/search/cs?searchtype=author&query=Behzad%2C+S">Shabnam Behzad</a>, <a href="/search/cs?searchtype=author&query=Levine%2C+L+E">Lauren Elizabeth Levine</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+J">Jessica Lin</a>, <a href="/search/cs?searchtype=author&query=Tiwari%2C+D">Devika Tiwari</a>, <a href="/search/cs?searchtype=author&query=Zeldes%2C+A">Amir Zeldes</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00491v1-abstract-short" style="display: inline;"> Work on shallow discourse parsing in English has focused on the Wall Street Journal corpus, the only large-scale dataset for the language in the PDTB framework. However, the data is not openly available, is restricted to the news domain, and is by now 35 years old. In this paper, we present and evaluate a new open-access, multi-genre benchmark for PDTB-style shallow discourse parsing, based on the… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00491v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00491v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00491v1-abstract-full" style="display: none;"> Work on shallow discourse parsing in English has focused on the Wall Street Journal corpus, the only large-scale dataset for the language in the PDTB framework. However, the data is not openly available, is restricted to the news domain, and is by now 35 years old. In this paper, we present and evaluate a new open-access, multi-genre benchmark for PDTB-style shallow discourse parsing, based on the existing UD English GUM corpus, for which discourse relation annotations in other frameworks already exist. In a series of experiments on cross-domain relation classification, we show that while our dataset is compatible with PDTB, substantial out-of-domain degradation is observed, which can be alleviated by joint training on both datasets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00491v1-abstract-full').style.display = 'none'; document.getElementById('2411.00491v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to EMNLP 2024 (main, long); camera-ready version</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00079">arXiv:2411.00079</a> <span> [<a href="https://arxiv.org/pdf/2411.00079">pdf</a>, <a href="https://arxiv.org/format/2411.00079">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Label Noise: Ignorance Is Bliss </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yilun Zhu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jianxin Zhang</a>, <a href="/search/cs?searchtype=author&query=Gangrade%2C+A">Aditya Gangrade</a>, <a href="/search/cs?searchtype=author&query=Scott%2C+C">Clayton Scott</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00079v1-abstract-short" style="display: inline;"> We establish a new theoretical framework for learning under multi-class, instance-dependent label noise. This framework casts learning with label noise as a form of domain adaptation, in particular, domain adaptation under posterior drift. We introduce the concept of \emph{relative signal strength} (RSS), a pointwise measure that quantifies the transferability from noisy to clean posterior. Using… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00079v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00079v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00079v1-abstract-full" style="display: none;"> We establish a new theoretical framework for learning under multi-class, instance-dependent label noise. This framework casts learning with label noise as a form of domain adaptation, in particular, domain adaptation under posterior drift. We introduce the concept of \emph{relative signal strength} (RSS), a pointwise measure that quantifies the transferability from noisy to clean posterior. Using RSS, we establish nearly matching upper and lower bounds on the excess risk. Our theoretical findings support the simple \emph{Noise Ignorant Empirical Risk Minimization (NI-ERM)} principle, which minimizes empirical risk while ignoring label noise. Finally, we translate this theoretical insight into practice: by using NI-ERM to fit a linear classifier on top of a self-supervised feature extractor, we achieve state-of-the-art performance on the CIFAR-N data challenge. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00079v1-abstract-full').style.display = 'none'; document.getElementById('2411.00079v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.24185">arXiv:2410.24185</a> <span> [<a href="https://arxiv.org/pdf/2410.24185">pdf</a>, <a href="https://arxiv.org/format/2410.24185">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> DexMimicGen: Automated Data Generation for Bimanual Dexterous Manipulation via Imitation Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jiang%2C+Z">Zhenyu Jiang</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+Y">Yuqi Xie</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+K">Kevin Lin</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Zhenjia Xu</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+W">Weikang Wan</a>, <a href="/search/cs?searchtype=author&query=Mandlekar%2C+A">Ajay Mandlekar</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+L">Linxi Fan</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yuke Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.24185v1-abstract-short" style="display: inline;"> Imitation learning from human demonstrations is an effective means to teach robots manipulation skills. But data acquisition is a major bottleneck in applying this paradigm more broadly, due to the amount of cost and human effort involved. There has been significant interest in imitation learning for bimanual dexterous robots, like humanoids. Unfortunately, data collection is even more challenging… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.24185v1-abstract-full').style.display = 'inline'; document.getElementById('2410.24185v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.24185v1-abstract-full" style="display: none;"> Imitation learning from human demonstrations is an effective means to teach robots manipulation skills. But data acquisition is a major bottleneck in applying this paradigm more broadly, due to the amount of cost and human effort involved. There has been significant interest in imitation learning for bimanual dexterous robots, like humanoids. Unfortunately, data collection is even more challenging here due to the challenges of simultaneously controlling multiple arms and multi-fingered hands. Automated data generation in simulation is a compelling, scalable alternative to fuel this need for data. To this end, we introduce DexMimicGen, a large-scale automated data generation system that synthesizes trajectories from a handful of human demonstrations for humanoid robots with dexterous hands. We present a collection of simulation environments in the setting of bimanual dexterous manipulation, spanning a range of manipulation behaviors and different requirements for coordination among the two arms. We generate 21K demos across these tasks from just 60 source human demos and study the effect of several data generation and policy learning decisions on agent performance. Finally, we present a real-to-sim-to-real pipeline and deploy it on a real-world humanoid can sorting task. Videos and more are at https://dexmimicgen.github.io/ <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.24185v1-abstract-full').style.display = 'none'; document.getElementById('2410.24185v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project website: https://dexmimicgen.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.23857">arXiv:2410.23857</a> <span> [<a href="https://arxiv.org/pdf/2410.23857">pdf</a>, <a href="https://arxiv.org/format/2410.23857">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Quantum Physics">quant-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> ECDQC: Efficient Compilation for Distributed Quantum Computing with Linear Layout </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+K">Kecheng Liu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yidong Zhou</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+H">Haochen Luo</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+L">Lingjun Xiong</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yuchen Zhu</a>, <a href="/search/cs?searchtype=author&query=Casey%2C+E">Eilis Casey</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+J">Jinglei Cheng</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+S+Y">Samuel Yen-Chi Chen</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+Z">Zhiding Liang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.23857v2-abstract-short" style="display: inline;"> In this paper, we propose an efficient compilation method for distributed quantum computing (DQC) using the Linear Nearest Neighbor (LNN) architecture. By exploiting the LNN topology's symmetry, we optimize quantum circuit compilation for High Local Connectivity, Sparse Full Connectivity (HLC-SFC) algorithms like Quantum Approximate Optimization Algorithm (QAOA) and Quantum Fourier Transform (QFT)… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23857v2-abstract-full').style.display = 'inline'; document.getElementById('2410.23857v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.23857v2-abstract-full" style="display: none;"> In this paper, we propose an efficient compilation method for distributed quantum computing (DQC) using the Linear Nearest Neighbor (LNN) architecture. By exploiting the LNN topology's symmetry, we optimize quantum circuit compilation for High Local Connectivity, Sparse Full Connectivity (HLC-SFC) algorithms like Quantum Approximate Optimization Algorithm (QAOA) and Quantum Fourier Transform (QFT). We also utilize dangling qubits to minimize non-local interactions and reduce SWAP gates. Our approach significantly decreases compilation time, gate count, and circuit depth, improving scalability and robustness for large-scale quantum computations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23857v2-abstract-full').style.display = 'none'; document.getElementById('2410.23857v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.23855">arXiv:2410.23855</a> <span> [<a href="https://arxiv.org/pdf/2410.23855">pdf</a>, <a href="https://arxiv.org/format/2410.23855">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> </div> </div> <p class="title is-5 mathjax"> RAGraph: A General Retrieval-Augmented Graph Learning Framework </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jiang%2C+X">Xinke Jiang</a>, <a href="/search/cs?searchtype=author&query=Qiu%2C+R">Rihong Qiu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Y">Yongxin Xu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wentao Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yichen Zhu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Ruizhe Zhang</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+Y">Yuchen Fang</a>, <a href="/search/cs?searchtype=author&query=Chu%2C+X">Xu Chu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+J">Junfeng Zhao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yasha Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.23855v1-abstract-short" style="display: inline;"> Graph Neural Networks (GNNs) have become essential in interpreting relational data across various domains, yet, they often struggle to generalize to unseen graph data that differs markedly from training instances. In this paper, we introduce a novel framework called General Retrieval-Augmented Graph Learning (RAGraph), which brings external graph data into the general graph foundation model to imp… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23855v1-abstract-full').style.display = 'inline'; document.getElementById('2410.23855v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.23855v1-abstract-full" style="display: none;"> Graph Neural Networks (GNNs) have become essential in interpreting relational data across various domains, yet, they often struggle to generalize to unseen graph data that differs markedly from training instances. In this paper, we introduce a novel framework called General Retrieval-Augmented Graph Learning (RAGraph), which brings external graph data into the general graph foundation model to improve model generalization on unseen scenarios. On the top of our framework is a toy graph vector library that we established, which captures key attributes, such as features and task-specific label information. During inference, the RAGraph adeptly retrieves similar toy graphs based on key similarities in downstream tasks, integrating the retrieved data to enrich the learning context via the message-passing prompting mechanism. Our extensive experimental evaluations demonstrate that RAGraph significantly outperforms state-of-the-art graph learning methods in multiple tasks such as node classification, link prediction, and graph classification across both dynamic and static datasets. Furthermore, extensive testing confirms that RAGraph consistently maintains high performance without the need for task-specific fine-tuning, highlighting its adaptability, robustness, and broad applicability. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23855v1-abstract-full').style.display = 'none'; document.getElementById('2410.23855v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.23788">arXiv:2410.23788</a> <span> [<a href="https://arxiv.org/pdf/2410.23788">pdf</a>, <a href="https://arxiv.org/format/2410.23788">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> EDT: An Efficient Diffusion Transformer Framework Inspired by Human-like Sketching </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xinwang Chen</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+N">Ning Liu</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yichen Zhu</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+F">Feifei Feng</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+J">Jian Tang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.23788v1-abstract-short" style="display: inline;"> Transformer-based Diffusion Probabilistic Models (DPMs) have shown more potential than CNN-based DPMs, yet their extensive computational requirements hinder widespread practical applications. To reduce the computation budget of transformer-based DPMs, this work proposes the Efficient Diffusion Transformer (EDT) framework. The framework includes a lightweight-design diffusion model architecture, an… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23788v1-abstract-full').style.display = 'inline'; document.getElementById('2410.23788v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.23788v1-abstract-full" style="display: none;"> Transformer-based Diffusion Probabilistic Models (DPMs) have shown more potential than CNN-based DPMs, yet their extensive computational requirements hinder widespread practical applications. To reduce the computation budget of transformer-based DPMs, this work proposes the Efficient Diffusion Transformer (EDT) framework. The framework includes a lightweight-design diffusion model architecture, and a training-free Attention Modulation Matrix and its alternation arrangement in EDT inspired by human-like sketching. Additionally, we propose a token relation-enhanced masking training strategy tailored explicitly for EDT to augment its token relation learning capability. Our extensive experiments demonstrate the efficacy of EDT. The EDT framework reduces training and inference costs and surpasses existing transformer-based diffusion models in image synthesis performance, thereby achieving a significant overall enhancement. With lower FID, EDT-S, EDT-B, and EDT-XL attained speed-ups of 3.93x, 2.84x, and 1.92x respectively in the training phase, and 2.29x, 2.29x, and 2.22x respectively in inference, compared to the corresponding sizes of MDTv2. The source code is released at https://github.com/xinwangChen/EDT. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23788v1-abstract-full').style.display = 'none'; document.getElementById('2410.23788v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Xinwang Chen and Ning Liu are with equal contributions. This paper has been accepted by NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.23769">arXiv:2410.23769</a> <span> [<a href="https://arxiv.org/pdf/2410.23769">pdf</a>, <a href="https://arxiv.org/format/2410.23769">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> The Potential of LLMs in Medical Education: Generating Questions and Answers for Qualification Exams </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yunqi Zhu</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+W">Wen Tang</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Y">Ying Sun</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xuebing Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.23769v1-abstract-short" style="display: inline;"> Recent research on large language models (LLMs) has primarily focused on their adaptation and application in specialized domains. The application of LLMs in the medical field is mainly concentrated on tasks such as the automation of medical report generation, summarization, diagnostic reasoning, and question-and-answer interactions between doctors and patients. The challenge of becoming a good tea… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23769v1-abstract-full').style.display = 'inline'; document.getElementById('2410.23769v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.23769v1-abstract-full" style="display: none;"> Recent research on large language models (LLMs) has primarily focused on their adaptation and application in specialized domains. The application of LLMs in the medical field is mainly concentrated on tasks such as the automation of medical report generation, summarization, diagnostic reasoning, and question-and-answer interactions between doctors and patients. The challenge of becoming a good teacher is more formidable than that of becoming a good student, and this study pioneers the application of LLMs in the field of medical education. In this work, we investigate the extent to which LLMs can generate medical qualification exam questions and corresponding answers based on few-shot prompts. Utilizing a real-world Chinese dataset of elderly chronic diseases, we tasked the LLMs with generating open-ended questions and answers based on a subset of sampled admission reports across eight widely used LLMs, including ERNIE 4, ChatGLM 4, Doubao, Hunyuan, Spark 4, Qwen, Llama 3, and Mistral. Furthermore, we engaged medical experts to manually evaluate these open-ended questions and answers across multiple dimensions. The study found that LLMs, after using few-shot prompts, can effectively mimic real-world medical qualification exam questions, whereas there is room for improvement in the correctness, evidence-based statements, and professionalism of the generated answers. Moreover, LLMs also demonstrate a decent level of ability to correct and rectify reference answers. Given the immense potential of artificial intelligence in the medical field, the task of generating questions and answers for medical qualification exams aimed at medical students, interns and residents can be a significant focus of future research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23769v1-abstract-full').style.display = 'none'; document.getElementById('2410.23769v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.23039">arXiv:2410.23039</a> <span> [<a href="https://arxiv.org/pdf/2410.23039">pdf</a>, <a href="https://arxiv.org/format/2410.23039">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Neural Attention Field: Emerging Point Relevance in 3D Scenes for One-Shot Dexterous Grasping </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qianxu Wang</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+C">Congyue Deng</a>, <a href="/search/cs?searchtype=author&query=Lum%2C+T+G+W">Tyler Ga Wei Lum</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yuanpei Chen</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yaodong Yang</a>, <a href="/search/cs?searchtype=author&query=Bohg%2C+J">Jeannette Bohg</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yixin Zhu</a>, <a href="/search/cs?searchtype=author&query=Guibas%2C+L">Leonidas Guibas</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.23039v1-abstract-short" style="display: inline;"> One-shot transfer of dexterous grasps to novel scenes with object and context variations has been a challenging problem. While distilled feature fields from large vision models have enabled semantic correspondences across 3D scenes, their features are point-based and restricted to object surfaces, limiting their capability of modeling complex semantic feature distributions for hand-object interact… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23039v1-abstract-full').style.display = 'inline'; document.getElementById('2410.23039v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.23039v1-abstract-full" style="display: none;"> One-shot transfer of dexterous grasps to novel scenes with object and context variations has been a challenging problem. While distilled feature fields from large vision models have enabled semantic correspondences across 3D scenes, their features are point-based and restricted to object surfaces, limiting their capability of modeling complex semantic feature distributions for hand-object interactions. In this work, we propose the \textit{neural attention field} for representing semantic-aware dense feature fields in the 3D space by modeling inter-point relevance instead of individual point features. Core to it is a transformer decoder that computes the cross-attention between any 3D query point with all the scene points, and provides the query point feature with an attention-based aggregation. We further propose a self-supervised framework for training the transformer decoder from only a few 3D pointclouds without hand demonstrations. Post-training, the attention field can be applied to novel scenes for semantics-aware dexterous grasping from one-shot demonstration. Experiments show that our method provides better optimization landscapes by encouraging the end-effector to focus on task-relevant scene regions, resulting in significant improvements in success rates on real robots compared with the feature-field-based methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23039v1-abstract-full').style.display = 'none'; document.getElementById('2410.23039v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.22691">arXiv:2410.22691</a> <span> [<a href="https://arxiv.org/pdf/2410.22691">pdf</a>, <a href="https://arxiv.org/format/2410.22691">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/LRA.2024.3487516">10.1109/LRA.2024.3487516 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> MiniTac: An Ultra-Compact 8 mm Vision-Based Tactile Sensor for Enhanced Palpation in Robot-Assisted Minimally Invasive Surgery </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+W">Wanlin Li</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Z">Zihang Zhao</a>, <a href="/search/cs?searchtype=author&query=Cui%2C+L">Leiyao Cui</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Weiyi Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Hangxin Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+L">Li-An Li</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yixin Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.22691v1-abstract-short" style="display: inline;"> Robot-assisted minimally invasive surgery (RAMIS) provides substantial benefits over traditional open and laparoscopic methods. However, a significant limitation of RAMIS is the surgeon's inability to palpate tissues, a crucial technique for examining tissue properties and detecting abnormalities, restricting the widespread adoption of RAMIS. To overcome this obstacle, we introduce MiniTac, a nove… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22691v1-abstract-full').style.display = 'inline'; document.getElementById('2410.22691v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.22691v1-abstract-full" style="display: none;"> Robot-assisted minimally invasive surgery (RAMIS) provides substantial benefits over traditional open and laparoscopic methods. However, a significant limitation of RAMIS is the surgeon's inability to palpate tissues, a crucial technique for examining tissue properties and detecting abnormalities, restricting the widespread adoption of RAMIS. To overcome this obstacle, we introduce MiniTac, a novel vision-based tactile sensor with an ultra-compact cross-sectional diameter of 8 mm, designed for seamless integration into mainstream RAMIS devices, particularly the Da Vinci surgical systems. MiniTac features a novel mechanoresponsive photonic elastomer membrane that changes color distribution under varying contact pressures. This color change is captured by an embedded miniature camera, allowing MiniTac to detect tumors both on the tissue surface and in deeper layers typically obscured from endoscopic view. MiniTac's efficacy has been rigorously tested on both phantoms and ex-vivo tissues. By leveraging advanced mechanoresponsive photonic materials, MiniTac represents a significant advancement in integrating tactile sensing into RAMIS, potentially expanding its applicability to a wider array of clinical scenarios that currently rely on traditional surgical approaches. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22691v1-abstract-full').style.display = 'none'; document.getElementById('2410.22691v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accepted for publication in the IEEE Robotics and Automation Letters (RA-L)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.22689">arXiv:2410.22689</a> <span> [<a href="https://arxiv.org/pdf/2410.22689">pdf</a>, <a href="https://arxiv.org/format/2410.22689">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Multi-Task Interactive Robot Fleet Learning with Visual World Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+H">Huihan Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yu Zhang</a>, <a href="/search/cs?searchtype=author&query=Betala%2C+V">Vaarij Betala</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+E">Evan Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">James Liu</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+C">Crystal Ding</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yuke Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.22689v1-abstract-short" style="display: inline;"> Recent advancements in large-scale multi-task robot learning offer the potential for deploying robot fleets in household and industrial settings, enabling them to perform diverse tasks across various environments. However, AI-enabled robots often face challenges with generalization and robustness when exposed to real-world variability and uncertainty. We introduce Sirius-Fleet, a multi-task intera… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22689v1-abstract-full').style.display = 'inline'; document.getElementById('2410.22689v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.22689v1-abstract-full" style="display: none;"> Recent advancements in large-scale multi-task robot learning offer the potential for deploying robot fleets in household and industrial settings, enabling them to perform diverse tasks across various environments. However, AI-enabled robots often face challenges with generalization and robustness when exposed to real-world variability and uncertainty. We introduce Sirius-Fleet, a multi-task interactive robot fleet learning framework to address these challenges. Sirius-Fleet monitors robot performance during deployment and involves humans to correct the robot's actions when necessary. We employ a visual world model to predict the outcomes of future actions and build anomaly predictors to predict whether they will likely result in anomalies. As the robot autonomy improves, the anomaly predictors automatically adapt their prediction criteria, leading to fewer requests for human intervention and gradually reducing human workload over time. Evaluations on large-scale benchmarks demonstrate Sirius-Fleet's effectiveness in improving multi-task policy performance and monitoring accuracy. We demonstrate Sirius-Fleet's performance in both RoboCasa in simulation and Mutex in the real world, two diverse, large-scale multi-task benchmarks. More information is available on the project website: https://ut-austin-rpl.github.io/sirius-fleet <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22689v1-abstract-full').style.display = 'none'; document.getElementById('2410.22689v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">In Proceedings of CoRL 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.22105">arXiv:2410.22105</a> <span> [<a href="https://arxiv.org/pdf/2410.22105">pdf</a>, <a href="https://arxiv.org/format/2410.22105">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Databases">cs.DB</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> DAGE: DAG Query Answering via Relational Combinator with Logical Constraints </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=He%2C+Y">Yunjie He</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+B">Bo Xiong</a>, <a href="/search/cs?searchtype=author&query=Hern%C3%A1ndez%2C+D">Daniel Hern谩ndez</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yuqicheng Zhu</a>, <a href="/search/cs?searchtype=author&query=Kharlamov%2C+E">Evgeny Kharlamov</a>, <a href="/search/cs?searchtype=author&query=Staab%2C+S">Steffen Staab</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.22105v1-abstract-short" style="display: inline;"> Predicting answers to queries over knowledge graphs is called a complex reasoning task because answering a query requires subdividing it into subqueries. Existing query embedding methods use this decomposition to compute the embedding of a query as the combination of the embedding of the subqueries. This requirement limits the answerable queries to queries having a single free variable and being d… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22105v1-abstract-full').style.display = 'inline'; document.getElementById('2410.22105v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.22105v1-abstract-full" style="display: none;"> Predicting answers to queries over knowledge graphs is called a complex reasoning task because answering a query requires subdividing it into subqueries. Existing query embedding methods use this decomposition to compute the embedding of a query as the combination of the embedding of the subqueries. This requirement limits the answerable queries to queries having a single free variable and being decomposable, which are called tree-form queries and correspond to the $\mathcal{SROI}^-$ description logic. In this paper, we define a more general set of queries, called DAG queries and formulated in the $\mathcal{ALCOIR}$ description logic, propose a query embedding method for them, called DAGE, and a new benchmark to evaluate query embeddings on them. Given the computational graph of a DAG query, DAGE combines the possibly multiple paths between two nodes into a single path with a trainable operator that represents the intersection of relations and learns DAG-DL from tautologies. We show that it is possible to implement DAGE on top of existing query embedding methods, and we empirically measure the improvement of our method over the results of vanilla methods evaluated in tree-form queries that approximate the DAG queries of our proposed benchmark. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22105v1-abstract-full').style.display = 'none'; document.getElementById('2410.22105v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.21257">arXiv:2410.21257</a> <span> [<a href="https://arxiv.org/pdf/2410.21257">pdf</a>, <a href="https://arxiv.org/format/2410.21257">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> One-Step Diffusion Policy: Fast Visuomotor Policies via Diffusion Distillation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhendong Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhaoshuo Li</a>, <a href="/search/cs?searchtype=author&query=Mandlekar%2C+A">Ajay Mandlekar</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Zhenjia Xu</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+J">Jiaojiao Fan</a>, <a href="/search/cs?searchtype=author&query=Narang%2C+Y">Yashraj Narang</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+L">Linxi Fan</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yuke Zhu</a>, <a href="/search/cs?searchtype=author&query=Balaji%2C+Y">Yogesh Balaji</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+M">Mingyuan Zhou</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+M">Ming-Yu Liu</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+Y">Yu Zeng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.21257v1-abstract-short" style="display: inline;"> Diffusion models, praised for their success in generative tasks, are increasingly being applied to robotics, demonstrating exceptional performance in behavior cloning. However, their slow generation process stemming from iterative denoising steps poses a challenge for real-time applications in resource-constrained robotics setups and dynamically changing environments. In this paper, we introduce t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21257v1-abstract-full').style.display = 'inline'; document.getElementById('2410.21257v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.21257v1-abstract-full" style="display: none;"> Diffusion models, praised for their success in generative tasks, are increasingly being applied to robotics, demonstrating exceptional performance in behavior cloning. However, their slow generation process stemming from iterative denoising steps poses a challenge for real-time applications in resource-constrained robotics setups and dynamically changing environments. In this paper, we introduce the One-Step Diffusion Policy (OneDP), a novel approach that distills knowledge from pre-trained diffusion policies into a single-step action generator, significantly accelerating response times for robotic control tasks. We ensure the distilled generator closely aligns with the original policy distribution by minimizing the Kullback-Leibler (KL) divergence along the diffusion chain, requiring only $2\%$-$10\%$ additional pre-training cost for convergence. We evaluated OneDP on 6 challenging simulation tasks as well as 4 self-designed real-world tasks using the Franka robot. The results demonstrate that OneDP not only achieves state-of-the-art success rates but also delivers an order-of-magnitude improvement in inference speed, boosting action prediction frequency from 1.5 Hz to 62 Hz, establishing its potential for dynamic and computationally constrained robotic applications. We share the project page at https://research.nvidia.com/labs/dir/onedp/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21257v1-abstract-full').style.display = 'none'; document.getElementById('2410.21257v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.21229">arXiv:2410.21229</a> <span> [<a href="https://arxiv.org/pdf/2410.21229">pdf</a>, <a href="https://arxiv.org/format/2410.21229">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> HOVER: Versatile Neural Whole-Body Controller for Humanoid Robots </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=He%2C+T">Tairan He</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+W">Wenli Xiao</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+T">Toru Lin</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+Z">Zhengyi Luo</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Zhenjia Xu</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+Z">Zhenyu Jiang</a>, <a href="/search/cs?searchtype=author&query=Kautz%2C+J">Jan Kautz</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+C">Changliu Liu</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+G">Guanya Shi</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiaolong Wang</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+L">Linxi Fan</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yuke Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.21229v1-abstract-short" style="display: inline;"> Humanoid whole-body control requires adapting to diverse tasks such as navigation, loco-manipulation, and tabletop manipulation, each demanding a different mode of control. For example, navigation relies on root velocity tracking, while tabletop manipulation prioritizes upper-body joint angle tracking. Existing approaches typically train individual policies tailored to a specific command space, li… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21229v1-abstract-full').style.display = 'inline'; document.getElementById('2410.21229v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.21229v1-abstract-full" style="display: none;"> Humanoid whole-body control requires adapting to diverse tasks such as navigation, loco-manipulation, and tabletop manipulation, each demanding a different mode of control. For example, navigation relies on root velocity tracking, while tabletop manipulation prioritizes upper-body joint angle tracking. Existing approaches typically train individual policies tailored to a specific command space, limiting their transferability across modes. We present the key insight that full-body kinematic motion imitation can serve as a common abstraction for all these tasks and provide general-purpose motor skills for learning multiple modes of whole-body control. Building on this, we propose HOVER (Humanoid Versatile Controller), a multi-mode policy distillation framework that consolidates diverse control modes into a unified policy. HOVER enables seamless transitions between control modes while preserving the distinct advantages of each, offering a robust and scalable solution for humanoid control across a wide range of modes. By eliminating the need for policy retraining for each control mode, our approach improves efficiency and flexibility for future humanoid applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21229v1-abstract-full').style.display = 'none'; document.getElementById('2410.21229v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project Page: see https://hover-versatile-humanoid.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.21012">arXiv:2410.21012</a> <span> [<a href="https://arxiv.org/pdf/2410.21012">pdf</a>, <a href="https://arxiv.org/format/2410.21012">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> FACT: Examining the Effectiveness of Iterative Context Rewriting for Multi-fact Retrieval </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jinlin Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Suyuchen Wang</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+Z">Ziwen Xia</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+S">Sirui Hong</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yun Zhu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+B">Bang Liu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+C">Chenglin Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.21012v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) are proficient at retrieving single facts from extended contexts, yet they struggle with tasks requiring the simultaneous retrieval of multiple facts, especially during generation. This paper identifies a novel "lost-in-the-middle" phenomenon, where LLMs progressively lose track of critical information throughout the generation process, resulting in incomplete or inacc… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21012v1-abstract-full').style.display = 'inline'; document.getElementById('2410.21012v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.21012v1-abstract-full" style="display: none;"> Large Language Models (LLMs) are proficient at retrieving single facts from extended contexts, yet they struggle with tasks requiring the simultaneous retrieval of multiple facts, especially during generation. This paper identifies a novel "lost-in-the-middle" phenomenon, where LLMs progressively lose track of critical information throughout the generation process, resulting in incomplete or inaccurate retrieval. To address this challenge, we introduce Find All Crucial Texts (FACT), an iterative retrieval method that refines context through successive rounds of rewriting. This approach enables models to capture essential facts incrementally, which are often overlooked in single-pass retrieval. Experiments demonstrate that FACT substantially enhances multi-fact retrieval performance across various tasks, though improvements are less notable in general-purpose QA scenarios. Our findings shed light on the limitations of LLMs in multi-fact retrieval and underscore the need for more resilient long-context retrieval strategies. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21012v1-abstract-full').style.display = 'none'; document.getElementById('2410.21012v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Work in Progress</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Zhu%2C+Y&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Zhu%2C+Y&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhu%2C+Y&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhu%2C+Y&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhu%2C+Y&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhu%2C+Y&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <div class="is-hidden-tablet">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary">  <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div>   <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div>  </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

CINXE.COM

Search | arXiv e-print repository