Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 114 results for author: <span class="mathjax">Qi, C</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Qi%2C+C">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Qi, C"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Qi%2C+C&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Qi, C"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Qi%2C+C&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Qi%2C+C&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Qi%2C+C&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Qi%2C+C&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14318">arXiv:2411.14318</a> <span> [<a href="https://arxiv.org/pdf/2411.14318">pdf</a>, <a href="https://arxiv.org/format/2411.14318">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Velocitune: A Velocity-based Dynamic Domain Reweighting Method for Continual Pre-training </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Luo%2C+Z">Zheheng Luo</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xin Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xiao Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Haoling Li</a>, <a href="/search/cs?searchtype=author&query=Gong%2C+Y">Yeyun Gong</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+C">Chen Qi</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+P">Peng Cheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14318v1-abstract-short" style="display: inline;"> It is well-known that a diverse corpus is critical for training large language models, which are typically constructed from a mixture of various domains. In general, previous efforts resort to sampling training data from different domains with static proportions, as well as adjusting data proportions during training. 
However, few methods have addressed the complexities of domain-adaptive continual… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14318v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14318v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14318v1-abstract-full" style="display: none;"> It is well-known that a diverse corpus is critical for training large language models, which are typically constructed from a mixture of various domains. In general, previous efforts resort to sampling training data from different domains with static proportions, as well as adjusting data proportions during training. However, few methods have addressed the complexities of domain-adaptive continual pre-training. To fill this gap, we propose Velocitune, a novel framework dynamically assesses learning velocity and adjusts data proportions accordingly, favoring slower-learning domains while shunning faster-learning ones, which is guided by a scaling law to indicate the desired learning goal for each domain with less associated cost. To evaluate the effectiveness of Velocitune, we conduct experiments in a reasoning-focused dataset with CodeLlama, as well as in a corpus specialised for system command generation with Llama3 and Mistral. Velocitune achieves performance gains in both math and code reasoning tasks and command-line generation benchmarks. Further analysis reveals that key factors driving Velocitune's effectiveness include target loss prediction and data ordering. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14318v1-abstract-full').style.display = 'none'; document.getElementById('2411.14318v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
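The abstract describes the reweighting rule only in words. Below is a minimal, hypothetical sketch of velocity-based domain reweighting, not the authors' implementation: each domain's "velocity" is taken as the fraction of the gap between its initial loss and a scaling-law-predicted target loss that has been closed so far, and domains that are further behind are sampled more. The function and parameter names (domain_weights, temperature, the example losses) are illustrative assumptions.

```python
import numpy as np

def domain_weights(init_loss, cur_loss, target_loss, temperature=1.0):
    """Illustrative velocity-based domain reweighting.

    progress_d = (init_loss_d - cur_loss_d) / (init_loss_d - target_loss_d)
    Slower-learning domains (small progress) receive larger sampling weights.
    """
    init_loss = np.asarray(init_loss, dtype=float)
    cur_loss = np.asarray(cur_loss, dtype=float)
    target_loss = np.asarray(target_loss, dtype=float)

    progress = (init_loss - cur_loss) / np.maximum(init_loss - target_loss, 1e-8)
    progress = np.clip(progress, 0.0, 1.0)      # fraction of the target gap closed so far
    logits = (1.0 - progress) / temperature     # favor domains that are behind
    weights = np.exp(logits - logits.max())
    return weights / weights.sum()

# Example: three domains (math, code, general text) with made-up losses.
w = domain_weights(init_loss=[2.8, 3.1, 2.2],
                   cur_loss=[2.6, 2.4, 2.1],
                   target_loss=[2.0, 2.2, 1.9])
print(w)  # the first domain has closed the least of its gap, so it is sampled most
```

In a training loop, such weights would be recomputed periodically from freshly evaluated per-domain validation losses and fed back into the data sampler.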
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Work in progress</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.03286">arXiv:2411.03286</a> <span> [<a href="https://arxiv.org/pdf/2411.03286">pdf</a>, <a href="https://arxiv.org/format/2411.03286">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> DiT4Edit: Diffusion Transformer for Image Editing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Feng%2C+K">Kunyu Feng</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+Y">Yue Ma</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Bingyuan Wang</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+C">Chenyang Qi</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Haozhe Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Q">Qifeng Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zeyu Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.03286v2-abstract-short" style="display: inline;"> Despite recent advances in UNet-based image editing, methods for shape-aware object editing in high-resolution images are still lacking. Compared to UNet, Diffusion Transformers (DiT) demonstrate superior capabilities to effectively capture the long-range dependencies among patches, leading to higher-quality image generation. In this paper, we propose DiT4Edit, the first Diffusion Transformer-base… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03286v2-abstract-full').style.display = 'inline'; document.getElementById('2411.03286v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.03286v2-abstract-full" style="display: none;"> Despite recent advances in UNet-based image editing, methods for shape-aware object editing in high-resolution images are still lacking. Compared to UNet, Diffusion Transformers (DiT) demonstrate superior capabilities to effectively capture the long-range dependencies among patches, leading to higher-quality image generation. In this paper, we propose DiT4Edit, the first Diffusion Transformer-based image editing framework. Specifically, DiT4Edit uses the DPM-Solver inversion algorithm to obtain the inverted latents, reducing the number of steps compared to the DDIM inversion algorithm commonly used in UNet-based frameworks. Additionally, we design unified attention control and patches merging, tailored for transformer computation streams. This integration allows our framework to generate higher-quality edited images faster. Our design leverages the advantages of DiT, enabling it to surpass UNet structures in image editing, especially in high-resolution and arbitrary-size images. Extensive experiments demonstrate the strong performance of DiT4Edit across various editing scenarios, highlighting the potential of Diffusion Transformers in supporting image editing. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03286v2-abstract-full').style.display = 'none'; document.getElementById('2411.03286v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 5 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01472">arXiv:2411.01472</a> <span> [<a href="https://arxiv.org/pdf/2411.01472">pdf</a>, <a href="https://arxiv.org/format/2411.01472">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Adaptive Domain Learning for Cross-domain Image Denoising </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Qian%2C+Z">Zian Qian</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+C">Chenyang Qi</a>, <a href="/search/cs?searchtype=author&query=Law%2C+K+L">Ka Lung Law</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+H">Hao Fu</a>, <a href="/search/cs?searchtype=author&query=Lei%2C+C">Chenyang Lei</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Q">Qifeng Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01472v1-abstract-short" style="display: inline;"> Different camera sensors have different noise patterns, and thus an image denoising model trained on one sensor often does not generalize well to a different sensor. One plausible solution is to collect a large dataset for each sensor for training or fine-tuning, which is inevitably time-consuming. To address this cross-domain challenge, we present a novel adaptive domain learning (ADL) scheme for… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01472v1-abstract-full').style.display = 'inline'; document.getElementById('2411.01472v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01472v1-abstract-full" style="display: none;"> Different camera sensors have different noise patterns, and thus an image denoising model trained on one sensor often does not generalize well to a different sensor. One plausible solution is to collect a large dataset for each sensor for training or fine-tuning, which is inevitably time-consuming. To address this cross-domain challenge, we present a novel adaptive domain learning (ADL) scheme for cross-domain RAW image denoising by utilizing existing data from different sensors (source domain) plus a small amount of data from the new sensor (target domain). The ADL training scheme automatically removes the data in the source domain that are harmful to fine-tuning a model for the target domain (some data are harmful as adding them during training lowers the performance due to domain gaps). 
4. arXiv:2410.11736 [pdf, other] cs.IT eess.SP
Near-Field Communications for Extremely Large-Scale MIMO: A Beamspace Perspective
Authors: Kangjian Chen, Chenhao Qi, Jingjia Huang, Octavia A. Dobre, Geoffrey Ye Li
Abstract: Extremely large-scale multiple-input multiple-output (XL-MIMO) is regarded as one of the key techniques to enhance the performance of future wireless communications. Different from regular MIMO, the XL-MIMO shifts part of the communication region from the far field to the near field, where the spherical-wave channel model cannot be accurately approximated by the commonly-adopted planar-wave channel model. As a result, the well-explored far-field beamspace is unsuitable for near-field communications, thereby requiring the exploration of specialized near-field beamspace. In this article, we investigate the near-field communications for XL-MIMO from the perspective of beamspace. Given the spherical wavefront characteristics of the near-field channels, we first map the antenna space to the near-field beamspace with the fractional Fourier transform. Then, we divide the near-field beamspace into three parts, including high mainlobe, low mainlobe, and sidelobe, and provide a comprehensive analysis of these components. Based on the analysis, we demonstrate the advantages of the near-field beamspace over the existing methods. Finally, we point out several applications of the near-field beamspace and highlight some potential directions for future study in the near-field beamspace.
Submitted 15 October, 2024; originally announced October 2024.

5. arXiv:2409.10127 [pdf, ps, other] cs.IT eess.SP
Joint Beamforming and Illumination Pattern Design for Beam-Hopping LEO Satellite Communications
Authors: Jing Wang, Chenhao Qi, Shui Yu, Shiwen Mao
Abstract: Since hybrid beamforming (HBF) can approach the performance of fully-digital beamforming (FDBF) with much lower hardware complexity, we investigate the HBF design for beam-hopping (BH) low earth orbit (LEO) satellite communications (SatComs). Aiming at maximizing the sum-rate of totally illuminated beam positions during the whole BH period, we consider joint beamforming and illumination pattern design subject to the HBF constraints and sum-rate requirements. To address the non-convexity of the HBF constraints, we temporarily replace the HBF constraints with the FDBF constraints. Then we propose an FDBF and illumination pattern random search (FDBF-IPRS) scheme to optimize illumination patterns and fully-digital beamformers using constrained random search and fractional programming methods. To further reduce the computational complexity, we propose an FDBF and illumination pattern alternating optimization (FDBF-IPAO) scheme, where we relax the integer illumination pattern to continuous variables and after finishing all the iterations we quantize the continuous variables into integer ones. Based on the fully-digital beamformers designed by the FDBF-IPRS or FDBF-IPAO scheme, we propose an HBF alternating minimization algorithm to design the hybrid beamformers. Simulation results show that the proposed schemes can achieve satisfactory sum-rate performance for BH LEO SatComs.
Submitted 16 September, 2024; originally announced September 2024.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.10071">arXiv:2409.10071</a> <span> [<a href="https://arxiv.org/pdf/2409.10071">pdf</a>, <a href="https://arxiv.org/format/2409.10071">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Towards Physically-Realizable Adversarial Attacks in Embodied Vision Navigation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+M">Meng Chen</a>, <a href="/search/cs?searchtype=author&query=Tu%2C+J">Jiawei Tu</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+C">Chao Qi</a>, <a href="/search/cs?searchtype=author&query=Dang%2C+Y">Yonghao Dang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+F">Feng Zhou</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+W">Wei Wei</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+J">Jianqin Yin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.10071v3-abstract-short" style="display: inline;"> The deployment of embodied navigation agents in safety-critical environments raises concerns about their vulnerability to adversarial attacks on deep neural networks. However, current attack methods often lack practicality due to challenges in transitioning from the digital to the physical world, while existing physical attacks for object detection fail to achieve both multi-view effectiveness and… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.10071v3-abstract-full').style.display = 'inline'; document.getElementById('2409.10071v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.10071v3-abstract-full" style="display: none;"> The deployment of embodied navigation agents in safety-critical environments raises concerns about their vulnerability to adversarial attacks on deep neural networks. However, current attack methods often lack practicality due to challenges in transitioning from the digital to the physical world, while existing physical attacks for object detection fail to achieve both multi-view effectiveness and naturalness. To address this, we propose a practical attack method for embodied navigation by attaching adversarial patches with learnable textures and opacity to objects. Specifically, to ensure effectiveness across varying viewpoints, we employ a multi-view optimization strategy based on object-aware sampling, which uses feedback from the navigation model to optimize the patch's texture. To make the patch inconspicuous to human observers, we introduce a two-stage opacity optimization mechanism, where opacity is refined after texture optimization. Experimental results show our adversarial patches reduce navigation success rates by about 40%, outperforming previous methods in practicality, effectiveness, and naturalness. Code is available at: [https://github.com/chen37058/Physical-Attacks-in-Embodied-Navigation]. 
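The two-stage recipe described above (optimize a patch texture over many viewpoints, then refine its opacity) can be illustrated with a short, self-contained sketch. Everything here is a stand-in: nav_model is a toy surrogate for the victim policy, random tensors replace multi-view renders, and the loss weights are arbitrary; the authors' actual code is at the repository linked in the abstract.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

# Hypothetical surrogate for the victim navigation policy: any differentiable model
# producing a "navigation success" score from an observation.
nav_model = nn.Sequential(nn.Flatten(), nn.Linear(3 * 32 * 32, 1))

def paste_patch(obs, texture, opacity, y=8, x=8):
    """Differentiably alpha-composite a (3, h, w) patch into (B, 3, H, W) observations."""
    _, _, H, W = obs.shape
    h, w = texture.shape[1:]
    patch = torch.sigmoid(texture)                  # texture values in [0, 1]
    alpha = torch.sigmoid(opacity)                  # scalar opacity in [0, 1]
    pad = (x, W - x - w, y, H - y - h)              # left, right, top, bottom
    patch_full = F.pad(patch, pad)                  # patch placed on a zero canvas
    mask_full = F.pad(torch.ones_like(patch), pad)  # 1 inside the patch region
    return obs * (1 - alpha * mask_full) + alpha * patch_full

texture = torch.zeros(3, 16, 16, requires_grad=True)
opacity = torch.tensor(2.0, requires_grad=True)

# Stage 1: optimize the texture over sampled viewpoints (random tensors stand in for
# multi-view renders); the attack minimizes the surrogate success score.
opt_tex = torch.optim.Adam([texture], lr=0.05)
for _ in range(100):
    views = torch.rand(8, 3, 32, 32)
    loss = nav_model(paste_patch(views, texture, opacity.detach())).mean()
    opt_tex.zero_grad(); loss.backward(); opt_tex.step()

# Stage 2: refine only the opacity, trading attack strength against visibility.
opt_op = torch.optim.Adam([opacity], lr=0.05)
for _ in range(100):
    views = torch.rand(8, 3, 32, 32)
    attack = nav_model(paste_patch(views, texture.detach(), opacity)).mean()
    loss = attack + 0.1 * torch.sigmoid(opacity)
    opt_op.zero_grad(); loss.backward(); opt_op.step()
```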
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.10071v3-abstract-full').style.display = 'none'; document.getElementById('2409.10071v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 6 figures, submitted to the 2025 IEEE International Conference on Robotics & Automation (ICRA)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.08572">arXiv:2409.08572</a> <span> [<a href="https://arxiv.org/pdf/2409.08572">pdf</a>, <a href="https://arxiv.org/format/2409.08572">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> DiffFAS: Face Anti-Spoofing via Generative Diffusion Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ge%2C+X">Xinxu Ge</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xin Liu</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+Z">Zitong Yu</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+J">Jingang Shi</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+C">Chun Qi</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jie Li</a>, <a href="/search/cs?searchtype=author&query=K%C3%A4lvi%C3%A4inen%2C+H">Heikki K盲lvi盲inen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.08572v1-abstract-short" style="display: inline;"> Face anti-spoofing (FAS) plays a vital role in preventing face recognition (FR) systems from presentation attacks. Nowadays, FAS systems face the challenge of domain shift, impacting the generalization performance of existing FAS methods. In this paper, we rethink about the inherence of domain shift and deconstruct it into two factors: image style and image quality. Quality influences the purity o… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.08572v1-abstract-full').style.display = 'inline'; document.getElementById('2409.08572v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.08572v1-abstract-full" style="display: none;"> Face anti-spoofing (FAS) plays a vital role in preventing face recognition (FR) systems from presentation attacks. Nowadays, FAS systems face the challenge of domain shift, impacting the generalization performance of existing FAS methods. In this paper, we rethink about the inherence of domain shift and deconstruct it into two factors: image style and image quality. Quality influences the purity of the presentation of spoof information, while style affects the manner in which spoof information is presented. 
Based on our analysis, we propose DiffFAS framework, which quantifies quality as prior information input into the network to counter image quality shift, and performs diffusion-based high-fidelity cross-domain and cross-attack types generation to counter image style shift. DiffFAS transforms easily collectible live faces into high-fidelity attack faces with precise labels while maintaining consistency between live and spoof face identities, which can also alleviate the scarcity of labeled data with novel type attacks faced by nowadays FAS system. We demonstrate the effectiveness of our framework on challenging cross-domain and cross-attack FAS datasets, achieving the state-of-the-art performance. Available at https://github.com/murphytju/DiffFAS. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.08572v1-abstract-full').style.display = 'none'; document.getElementById('2409.08572v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ECCV 24</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.01163">arXiv:2407.01163</a> <span> [<a href="https://arxiv.org/pdf/2407.01163">pdf</a>, <a href="https://arxiv.org/format/2407.01163">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Benchmarking Predictive Coding Networks -- Made Simple </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Pinchetti%2C+L">Luca Pinchetti</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+C">Chang Qi</a>, <a href="/search/cs?searchtype=author&query=Lokshyn%2C+O">Oleh Lokshyn</a>, <a href="/search/cs?searchtype=author&query=Olivers%2C+G">Gaspard Olivers</a>, <a href="/search/cs?searchtype=author&query=Emde%2C+C">Cornelius Emde</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+M">Mufeng Tang</a>, <a href="/search/cs?searchtype=author&query=M%27Charrak%2C+A">Amine M'Charrak</a>, <a href="/search/cs?searchtype=author&query=Frieder%2C+S">Simon Frieder</a>, <a href="/search/cs?searchtype=author&query=Menzat%2C+B">Bayar Menzat</a>, <a href="/search/cs?searchtype=author&query=Bogacz%2C+R">Rafal Bogacz</a>, <a href="/search/cs?searchtype=author&query=Lukasiewicz%2C+T">Thomas Lukasiewicz</a>, <a href="/search/cs?searchtype=author&query=Salvatori%2C+T">Tommaso Salvatori</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.01163v1-abstract-short" style="display: inline;"> In this work, we tackle the problems of efficiency and scalability for predictive coding networks in machine learning. To do so, we first propose a library called PCX, whose focus lies on performance and simplicity, and provides a user-friendly, deep-learning oriented interface. 
Second, we use PCX to implement a large set of benchmarks for the community to use for their experiments. As most works… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.01163v1-abstract-full').style.display = 'inline'; document.getElementById('2407.01163v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.01163v1-abstract-full" style="display: none;"> In this work, we tackle the problems of efficiency and scalability for predictive coding networks in machine learning. To do so, we first propose a library called PCX, whose focus lies on performance and simplicity, and provides a user-friendly, deep-learning oriented interface. Second, we use PCX to implement a large set of benchmarks for the community to use for their experiments. As most works propose their own tasks and architectures, do not compare one against each other, and focus on small-scale tasks, a simple and fast open-source library adopted by the whole community would address all of these concerns. Third, we perform extensive benchmarks using multiple algorithms, setting new state-of-the-art results in multiple tasks and datasets, as well as highlighting limitations inherent to PC that should be addressed. Thanks to the efficiency of PCX, we are able to analyze larger architectures than commonly used, providing baselines to galvanize community efforts towards one of the main open problems in the field: scalability. The code for PCX is available at \textit{https://github.com/liukidar/pcax}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.01163v1-abstract-full').style.display = 'none'; document.getElementById('2407.01163v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">33 pages, 25 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.6 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.09238">arXiv:2406.09238</a> <span> [<a href="https://arxiv.org/pdf/2406.09238">pdf</a>, <a href="https://arxiv.org/format/2406.09238">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Near-Field Multiuser Communications based on Sparse Arrays </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+K">Kangjian Chen</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+C">Chenhao Qi</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G+Y">Geoffrey Ye Li</a>, <a href="/search/cs?searchtype=author&query=Dobre%2C+O+A">Octavia A. 
Dobre</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.09238v1-abstract-short" style="display: inline;"> This paper considers near-field multiuser communications based on sparse arrays (SAs). First, for the uniform SAs (USAs), we analyze the beam gains of channel steering vectors, which shows that increasing the antenna spacings can effectively improve the spatial resolution of the antenna arrays to enhance the sum rate of multiuser communications. Then, we investigate nonuniform SAs (NSAs) to mitiga… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.09238v1-abstract-full').style.display = 'inline'; document.getElementById('2406.09238v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.09238v1-abstract-full" style="display: none;"> This paper considers near-field multiuser communications based on sparse arrays (SAs). First, for the uniform SAs (USAs), we analyze the beam gains of channel steering vectors, which shows that increasing the antenna spacings can effectively improve the spatial resolution of the antenna arrays to enhance the sum rate of multiuser communications. Then, we investigate nonuniform SAs (NSAs) to mitigate the high multiuser interference from the grating lobes of the USAs. To maximize the sum rate of near-field multiuser communications, we optimize the antenna positions of the NSAs, where a successive convex approximation-based antenna position optimization algorithm is proposed. Moreover, we find that the channels of both the USAs and the NSAs show uniform sparsity in the defined surrogate distance-angle (SD-A) domain. Based on the channel sparsity, an on-grid SD-A-domain orthogonal matching pursuit (SDA-OMP) algorithm is developed to estimate multiuser channels. To further improve the resolution of the SDA-OMP, we also design an off-grid SD-A-domain iterative super-resolution channel estimation algorithm. Simulation results demonstrate the superior performance of the proposed methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.09238v1-abstract-full').style.display = 'none'; document.getElementById('2406.09238v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. 
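The SDA-OMP estimator mentioned above is, at its core, orthogonal matching pursuit run over a dictionary defined in the surrogate distance-angle domain. That dictionary is not specified in the abstract, so the sketch below shows plain OMP with a random complex dictionary as a stand-in; it recovers a k-sparse coefficient vector from noiseless measurements.

```python
import numpy as np

def omp(A, y, k):
    """Generic orthogonal matching pursuit: find a k-sparse x with y ≈ A @ x.
    In the paper's setting, A would be the SD-A-domain dictionary and y the
    received pilot measurements; here A is a random stand-in."""
    m, n = A.shape
    residual = y.copy()
    support = []
    x = np.zeros(n, dtype=A.dtype)
    for _ in range(k):
        # pick the dictionary column most correlated with the residual
        j = int(np.argmax(np.abs(A.conj().T @ residual)))
        support.append(j)
        # least-squares re-fit on the current support, then update the residual
        coef, *_ = np.linalg.lstsq(A[:, support], y, rcond=None)
        residual = y - A[:, support] @ coef
    x[support] = coef
    return x

rng = np.random.default_rng(0)
A = rng.standard_normal((64, 256)) + 1j * rng.standard_normal((64, 256))
x_true = np.zeros(256, dtype=complex)
x_true[rng.choice(256, 3, replace=False)] = rng.standard_normal(3) + 1j * rng.standard_normal(3)
y = A @ x_true
x_hat = omp(A, y, k=3)
print(np.linalg.norm(x_hat - x_true))   # typically near zero in this noiseless toy example
```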
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.03113">arXiv:2405.03113</a> <span> [<a href="https://arxiv.org/pdf/2405.03113">pdf</a>, <a href="https://arxiv.org/format/2405.03113">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Robot Air Hockey: A Manipulation Testbed for Robot Learning with Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chuck%2C+C">Caleb Chuck</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+C">Carl Qi</a>, <a href="/search/cs?searchtype=author&query=Munje%2C+M+J">Michael J. Munje</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Shuozhe Li</a>, <a href="/search/cs?searchtype=author&query=Rudolph%2C+M">Max Rudolph</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+C">Chang Shi</a>, <a href="/search/cs?searchtype=author&query=Agarwal%2C+S">Siddhant Agarwal</a>, <a href="/search/cs?searchtype=author&query=Sikchi%2C+H">Harshit Sikchi</a>, <a href="/search/cs?searchtype=author&query=Peri%2C+A">Abhinav Peri</a>, <a href="/search/cs?searchtype=author&query=Dayal%2C+S">Sarthak Dayal</a>, <a href="/search/cs?searchtype=author&query=Kuo%2C+E">Evan Kuo</a>, <a href="/search/cs?searchtype=author&query=Mehta%2C+K">Kavan Mehta</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+A">Anthony Wang</a>, <a href="/search/cs?searchtype=author&query=Stone%2C+P">Peter Stone</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+A">Amy Zhang</a>, <a href="/search/cs?searchtype=author&query=Niekum%2C+S">Scott Niekum</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.03113v1-abstract-short" style="display: inline;"> Reinforcement Learning is a promising tool for learning complex policies even in fast-moving and object-interactive domains where human teleoperation or hard-coded policies might fail. To effectively reflect this challenging category of tasks, we introduce a dynamic, interactive RL testbed based on robot air hockey. By augmenting air hockey with a large family of tasks ranging from easy tasks like… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.03113v1-abstract-full').style.display = 'inline'; document.getElementById('2405.03113v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.03113v1-abstract-full" style="display: none;"> Reinforcement Learning is a promising tool for learning complex policies even in fast-moving and object-interactive domains where human teleoperation or hard-coded policies might fail. To effectively reflect this challenging category of tasks, we introduce a dynamic, interactive RL testbed based on robot air hockey. By augmenting air hockey with a large family of tasks ranging from easy tasks like reaching, to challenging ones like pushing a block by hitting it with a puck, as well as goal-based and human-interactive tasks, our testbed allows a varied assessment of RL capabilities. 
The robot air hockey testbed also supports sim-to-real transfer with three domains: two simulators of increasing fidelity and a real robot system. Using a dataset of demonstration data gathered through two teleoperation systems: a virtualized control environment, and human shadowing, we assess the testbed with behavior cloning, offline RL, and RL from scratch. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.03113v1-abstract-full').style.display = 'none'; document.getElementById('2405.03113v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.02243">arXiv:2405.02243</a> <span> [<a href="https://arxiv.org/pdf/2405.02243">pdf</a>, <a href="https://arxiv.org/format/2405.02243">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Towards Improving Learning from Demonstration Algorithms via MCMC Methods </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Qi%2C+C">Carl Qi</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+E">Edward Sun</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Harry Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.02243v3-abstract-short" style="display: inline;"> Behavioral cloning, or more broadly, learning from demonstrations (LfD) is a priomising direction for robot policy learning in complex scenarios. Albeit being straightforward to implement and data-efficient, behavioral cloning has its own drawbacks, limiting its efficacy in real robot setups. In this work, we take one step towards improving learning from demonstration algorithms by leveraging impl… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.02243v3-abstract-full').style.display = 'inline'; document.getElementById('2405.02243v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.02243v3-abstract-full" style="display: none;"> Behavioral cloning, or more broadly, learning from demonstrations (LfD) is a priomising direction for robot policy learning in complex scenarios. Albeit being straightforward to implement and data-efficient, behavioral cloning has its own drawbacks, limiting its efficacy in real robot setups. In this work, we take one step towards improving learning from demonstration algorithms by leveraging implicit energy-based policy models. Results suggest that in selected complex robot policy learning scenarios, treating supervised policy learning with an implicit model generally performs better, on average, than commonly used neural network-based explicit models, especially in the cases of approximating potentially discontinuous and multimodal functions. 
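An implicit energy-based policy of the kind referred to above scores state-action pairs with an energy and picks actions by minimizing it, which is what lets it represent discontinuous and multimodal action mappings. The sketch below follows the general implicit behavioral-cloning recipe (InfoNCE-style contrastive training, sampling-based inference) rather than this paper's specific MCMC method; the toy expert a = sign(s_0) and all hyperparameters are invented for illustration.

```python
import torch
import torch.nn as nn

class EnergyPolicy(nn.Module):
    """Implicit policy: a scalar energy E(s, a); actions are chosen by (approximately)
    minimizing the energy instead of being predicted by a regression head."""
    def __init__(self, s_dim, a_dim, hidden=64):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(s_dim + a_dim, hidden), nn.ReLU(),
                                 nn.Linear(hidden, 1))

    def forward(self, s, a):
        return self.net(torch.cat([s, a], dim=-1)).squeeze(-1)

def infonce_loss(model, s, a_expert, n_neg=64):
    """Contrastive training: the expert action should get the lowest energy among
    itself and uniformly sampled counter-examples."""
    B, a_dim = a_expert.shape
    a_neg = torch.rand(B, n_neg, a_dim) * 2 - 1                 # negatives in [-1, 1]
    a_all = torch.cat([a_expert.unsqueeze(1), a_neg], dim=1)    # (B, 1 + n_neg, a_dim)
    s_all = s.unsqueeze(1).expand(-1, 1 + n_neg, -1)
    logits = -model(s_all, a_all)                               # low energy -> high logit
    labels = torch.zeros(B, dtype=torch.long)                   # expert sits at index 0
    return nn.functional.cross_entropy(logits, labels)

def act(model, s, a_dim, n_samples=512):
    """Derivative-free inference: sample candidates and take the energy minimizer
    (an MCMC or Langevin refinement could replace this sampling step)."""
    cand = torch.rand(n_samples, a_dim) * 2 - 1
    with torch.no_grad():
        e = model(s.expand(n_samples, -1), cand)
    return cand[e.argmin()]

torch.manual_seed(0)
s_dim, a_dim = 2, 1
model = EnergyPolicy(s_dim, a_dim)
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
for _ in range(2000):
    s = torch.rand(128, s_dim) * 2 - 1
    a_expert = torch.sign(s[:, :1])        # a discontinuous "expert": a = sign(s_0)
    loss = infonce_loss(model, s, a_expert)
    opt.zero_grad(); loss.backward(); opt.step()

print(act(model, torch.tensor([[0.3, -0.8]]), a_dim))   # typically close to +1
```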
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.02243v3-abstract-full').style.display = 'none'; document.getElementById('2405.02243v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">arXiv admin note: text overlap with arXiv:2207.04638, arXiv:2204.03597 by other authors</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.19531">arXiv:2404.19531</a> <span> [<a href="https://arxiv.org/pdf/2404.19531">pdf</a>, <a href="https://arxiv.org/format/2404.19531">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MoST: Multi-modality Scene Tokenization for Motion Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mu%2C+N">Norman Mu</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+J">Jingwei Ji</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zhenpei Yang</a>, <a href="/search/cs?searchtype=author&query=Harada%2C+N">Nate Harada</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+H">Haotian Tang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+K">Kan Chen</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+C+R">Charles R. Qi</a>, <a href="/search/cs?searchtype=author&query=Ge%2C+R">Runzhou Ge</a>, <a href="/search/cs?searchtype=author&query=Goel%2C+K">Kratarth Goel</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zoey Yang</a>, <a href="/search/cs?searchtype=author&query=Ettinger%2C+S">Scott Ettinger</a>, <a href="/search/cs?searchtype=author&query=Al-Rfou%2C+R">Rami Al-Rfou</a>, <a href="/search/cs?searchtype=author&query=Anguelov%2C+D">Dragomir Anguelov</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yin Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.19531v1-abstract-short" style="display: inline;"> Many existing motion prediction approaches rely on symbolic perception outputs to generate agent trajectories, such as bounding boxes, road graph information and traffic lights. 
This symbolic representation is a high-level abstraction of the real world, which may render the motion prediction model vulnerable to perception errors (e.g., failures in detecting open-vocabulary obstacles) while missing… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.19531v1-abstract-full').style.display = 'inline'; document.getElementById('2404.19531v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.19531v1-abstract-full" style="display: none;"> Many existing motion prediction approaches rely on symbolic perception outputs to generate agent trajectories, such as bounding boxes, road graph information and traffic lights. This symbolic representation is a high-level abstraction of the real world, which may render the motion prediction model vulnerable to perception errors (e.g., failures in detecting open-vocabulary obstacles) while missing salient information from the scene context (e.g., poor road conditions). An alternative paradigm is end-to-end learning from raw sensors. However, this approach suffers from the lack of interpretability and requires significantly more training resources. In this work, we propose tokenizing the visual world into a compact set of scene elements and then leveraging pre-trained image foundation models and LiDAR neural networks to encode all the scene elements in an open-vocabulary manner. The image foundation model enables our scene tokens to encode the general knowledge of the open world while the LiDAR neural network encodes geometry information. Our proposed representation can efficiently encode the multi-frame multi-modality observations with a few hundred tokens and is compatible with most transformer-based architectures. To evaluate our method, we have augmented Waymo Open Motion Dataset with camera embeddings. Experiments over Waymo Open Motion Dataset show that our approach leads to significant performance improvements over the state-of-the-art. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.19531v1-abstract-full').style.display = 'none'; document.getElementById('2404.19531v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">CVPR 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.08268">arXiv:2403.08268</a> <span> [<a href="https://arxiv.org/pdf/2403.08268">pdf</a>, <a href="https://arxiv.org/format/2403.08268">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Follow-Your-Click: Open-domain Regional Image Animation via Short Prompts </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ma%2C+Y">Yue Ma</a>, <a href="/search/cs?searchtype=author&query=He%2C+Y">Yingqing He</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hongfa Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+A">Andong Wang</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+C">Chenyang Qi</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+C">Chengfei Cai</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiu Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhifeng Li</a>, <a href="/search/cs?searchtype=author&query=Shum%2C+H">Heung-Yeung Shum</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+W">Wei Liu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Q">Qifeng Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.08268v1-abstract-short" style="display: inline;"> Despite recent advances in image-to-video generation, better controllability and local animation are less explored. Most existing image-to-video methods are not locally aware and tend to move the entire scene. However, human artists may need to control the movement of different objects or regions. Additionally, current I2V methods require users not only to describe the target motion but also to pr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.08268v1-abstract-full').style.display = 'inline'; document.getElementById('2403.08268v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.08268v1-abstract-full" style="display: none;"> Despite recent advances in image-to-video generation, better controllability and local animation are less explored. Most existing image-to-video methods are not locally aware and tend to move the entire scene. However, human artists may need to control the movement of different objects or regions. Additionally, current I2V methods require users not only to describe the target motion but also to provide redundant detailed descriptions of frame contents. These two issues hinder the practical utilization of current I2V tools. In this paper, we propose a practical framework, named Follow-Your-Click, to achieve image animation with a simple user click (for specifying what to move) and a short motion prompt (for specifying how to move). Technically, we propose the first-frame masking strategy, which significantly improves the video generation quality, and a motion-augmented module equipped with a short motion prompt dataset to improve the short prompt following abilities of our model. 
To further control the motion speed, we propose flow-based motion magnitude control to control the speed of target movement more precisely. Our framework has simpler yet precise user control and better generation performance than previous methods. Extensive experiments compared with 7 baselines, including both commercial tools and research methods on 8 metrics, suggest the superiority of our approach. Project Page: https://follow-your-click.github.io/ <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.08268v1-abstract-full').style.display = 'none'; document.getElementById('2403.08268v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project Page: https://follow-your-click.github.io/ Github Page: https://github.com/mayuelala/FollowYourClick</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.02321">arXiv:2402.02321</a> <span> [<a href="https://arxiv.org/pdf/2402.02321">pdf</a>, <a href="https://arxiv.org/format/2402.02321">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Active Learning for Graphs with Noisy Structures </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chi%2C+H">Hongliang Chi</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+C">Cong Qi</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Suhang Wang</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+Y">Yao Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.02321v1-abstract-short" style="display: inline;"> Graph Neural Networks (GNNs) have seen significant success in tasks such as node classification, largely contingent upon the availability of sufficient labeled nodes. Yet, the excessive cost of labeling large-scale graphs led to a focus on active learning on graphs, which aims for effective data selection to maximize downstream model performance. Notably, most existing methods assume reliable grap… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.02321v1-abstract-full').style.display = 'inline'; document.getElementById('2402.02321v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.02321v1-abstract-full" style="display: none;"> Graph Neural Networks (GNNs) have seen significant success in tasks such as node classification, largely contingent upon the availability of sufficient labeled nodes. Yet, the excessive cost of labeling large-scale graphs led to a focus on active learning on graphs, which aims for effective data selection to maximize downstream model performance. Notably, most existing methods assume reliable graph topology, while real-world scenarios often present noisy graphs. 
Given this, designing a successful active learning framework for noisy graphs is highly needed but challenging, as selecting data for labeling and obtaining a clean graph are two tasks naturally interdependent: selecting high-quality data requires clean graph structure while cleaning noisy graph structure requires sufficient labeled data. Considering the complexity mentioned above, we propose an active learning framework, GALClean, which has been specifically designed to adopt an iterative approach for conducting both data selection and graph purification simultaneously with best information learned from the prior iteration. Importantly, we summarize GALClean as an instance of the Expectation-Maximization algorithm, which provides a theoretical understanding of its design and mechanisms. This theory naturally leads to an enhanced version, GALClean+. Extensive experiments have demonstrated the effectiveness and robustness of our proposed method across various types and levels of noisy graphs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.02321v1-abstract-full').style.display = 'none'; document.getElementById('2402.02321v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.11195">arXiv:2401.11195</a> <span> [<a href="https://arxiv.org/pdf/2401.11195">pdf</a>, <a href="https://arxiv.org/format/2401.11195">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/TWC.2024.3351712">10.1109/TWC.2024.3351712 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Triple-Refined Hybrid-Field Beam Training for mmWave Extremely Large-Scale MIMO </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+K">Kangjian Chen</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+C">Chenhao Qi</a>, <a href="/search/cs?searchtype=author&query=Dobre%2C+O+A">Octavia A. Dobre</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G+Y">Geoffrey Ye Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.11195v1-abstract-short" style="display: inline;"> This paper investigates beam training for extremely large-scale multiple-input multiple-output systems. By considering both the near field and far field, a triple-refined hybrid-field beam training scheme is proposed, where high-accuracy estimates of channel parameters are obtained through three steps of progressive beam refinement. 
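The GALClean entry above frames active learning on noisy graphs as alternating data selection and graph purification in an EM-like loop. As a reading aid only, the following is a minimal schematic sketch of such an alternating loop; it is not the authors' implementation, and every function name (select_nodes, purify_graph, train_gnn) is a hypothetical placeholder.

```python
# Schematic sketch (not the authors' code) of the alternating loop described in
# the GALClean abstract: each round selects nodes to label using the current
# cleaned graph, then re-purifies the graph with the enlarged label set.
# All callables passed in are hypothetical placeholders.

def galclean_style_loop(graph, features, budget_per_round, num_rounds,
                        select_nodes, purify_graph, train_gnn):
    """Alternate data selection and graph purification for num_rounds rounds."""
    labeled = set()          # indices of labeled nodes
    cleaned = graph          # start from the observed (noisy) graph
    model = None
    for _ in range(num_rounds):
        # Select the most informative nodes given the current cleaned graph.
        new_nodes = select_nodes(cleaned, features, labeled, budget_per_round)
        labeled |= set(new_nodes)
        # Re-estimate (purify) the graph structure with the enlarged label set.
        cleaned = purify_graph(graph, features, labeled)
        # Retrain the GNN on the purified graph and the current labels.
        model = train_gnn(cleaned, features, labeled)
    return model, cleaned, labeled
```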
arXiv:2401.11195 [pdf, other] cs.IT eess.SP doi:10.1109/TWC.2024.3351712
Triple-Refined Hybrid-Field Beam Training for mmWave Extremely Large-Scale MIMO
Authors: Kangjian Chen, Chenhao Qi, Octavia A. Dobre, Geoffrey Ye Li
Abstract: This paper investigates beam training for extremely large-scale multiple-input multiple-output systems. By considering both the near field and the far field, a triple-refined hybrid-field beam training scheme is proposed, where high-accuracy estimates of the channel parameters are obtained through three steps of progressive beam refinement. First, the hybrid-field beam gain (HFBG)-based first refinement method is developed. Based on the analysis of the HFBG, the first-refinement codebook is designed and the beam training is performed accordingly to narrow down the potential region of the channel path. Then, the maximum likelihood (ML)-based and principle of stationary phase (PSP)-based second refinement methods are developed. By exploiting the measurements of the beam training, the ML is used to estimate the channel parameters. To avoid the high computational complexity of the ML, closed-form estimates of the channel parameters are derived according to the PSP. Moreover, the Gaussian approximation (GA)-based third refinement method is developed. The hybrid-field neighboring search is first performed to identify the potential region of the main lobe of the channel steering vector. Afterwards, by applying the GA, a least-squares estimator is developed to obtain high-accuracy channel parameter estimates. Simulation results verify the effectiveness of the proposed scheme.
Submitted 20 January, 2024; originally announced January 2024.
Journal ref: IEEE Transactions on Wireless Communications, 2024

arXiv:2401.08976 [pdf] cs.LG eess.SP
ACT-GAN: Radio map construction based on generative adversarial networks with ACT blocks
Authors: Chen Qi, Yang Jingjing, Huang Ming, Zhou Qiang
Abstract: The radio map, serving as a visual representation of electromagnetic spatial characteristics, plays a pivotal role in the assessment of wireless communication networks and radio monitoring coverage. Addressing the low accuracy of current radio map construction, this paper presents a novel construction method based on a generative adversarial network (GAN), in which the Aggregated Contextual-Transformation (AOT) block, the Convolutional Block Attention Module (CBAM), and the Transposed Convolution (T-Conv) block are applied to the generator; we name it ACT-GAN. It significantly improves the reconstruction accuracy and local texture of the radio maps. The performance of ACT-GAN across three different scenarios is demonstrated. Experimental results reveal that in the scenario without sparse discrete observations, the proposed method reduces the root mean square error (RMSE) by 14.6% in comparison with the state-of-the-art models. In the scenario with sparse discrete observations, the RMSE is diminished by 13.2%. Furthermore, the predictive results of the proposed model show a more lucid representation of the electromagnetic spatial field distribution. To verify the universality of this model in radio map construction tasks, the scenario of an unknown radio emission source is investigated. The results indicate that the proposed model is robust in radio map construction and accurate in predicting the location of the emission source.
Submitted 17 January, 2024; originally announced January 2024.
Comments: 11 pages, 10 figures
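The ACT-GAN entry above reports accuracy as root mean square error (RMSE) reductions of 14.6% and 13.2%. For reference, RMSE over a reconstructed radio map can be computed as in the short sketch below; the map size and value range are illustrative assumptions, not taken from the paper.

```python
import numpy as np

def rmse(predicted_map: np.ndarray, ground_truth_map: np.ndarray) -> float:
    """Root mean square error between a predicted and a reference radio map."""
    return float(np.sqrt(np.mean((predicted_map - ground_truth_map) ** 2)))

# Example: two 64x64 maps of received signal strength (arbitrary units).
rng = np.random.default_rng(0)
truth = rng.uniform(-120.0, -60.0, size=(64, 64))
prediction = truth + rng.normal(0.0, 2.0, size=(64, 64))
print(f"RMSE = {rmse(prediction, truth):.2f}")
```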
arXiv:2312.11595 [pdf, other] cs.CV
SPIRE: Semantic Prompt-Driven Image Restoration
Authors: Chenyang Qi, Zhengzhong Tu, Keren Ye, Mauricio Delbracio, Peyman Milanfar, Qifeng Chen, Hossein Talebi
Abstract: Text-driven diffusion models have become increasingly popular for various image editing tasks, including inpainting, stylization, and object replacement. However, it remains an open research problem to adopt this language-vision paradigm for finer-level image processing tasks, such as denoising, super-resolution, deblurring, and compression artifact removal. In this paper, we develop SPIRE, a Semantic and restoration Prompt-driven Image Restoration framework that leverages natural language as a user-friendly interface to control the image restoration process. We consider the capacity of prompt information in two dimensions. First, we use content-related prompts to enhance the semantic alignment, effectively alleviating identity ambiguity in the restoration outcomes. Second, our approach is the first framework that supports fine-level instruction through language-based quantitative specification of the restoration strength, without the need for explicit task-specific design. In addition, we introduce a novel fusion mechanism that augments the existing ControlNet architecture by learning to rescale the generative prior, thereby achieving better restoration fidelity. Our extensive experiments demonstrate the superior restoration performance of SPIRE compared to the state of the art, alongside offering the flexibility of text-based control over the restoration effects.
Submitted 16 July, 2024; v1 submitted 18 December, 2023; originally announced December 2023.
Comments: Accepted by ECCV 2024; Webpage: https://chenyangqiqi.github.io/tip

arXiv:2312.03793 [pdf, other] cs.CV
AnimateZero: Video Diffusion Models are Zero-Shot Image Animators
Authors: Jiwen Yu, Xiaodong Cun, Chenyang Qi, Yong Zhang, Xintao Wang, Ying Shan, Jian Zhang
Abstract: Large-scale text-to-video (T2V) diffusion models have made great progress in recent years in terms of visual quality, motion and temporal consistency. However, the generation process is still a black box, where all attributes (e.g., appearance, motion) are learned and generated jointly without precise control ability other than rough text descriptions. Inspired by image animation, which decouples a video into one specific appearance with the corresponding motion, we propose AnimateZero to unveil the pre-trained text-to-video diffusion model, i.e., AnimateDiff, and provide more precise appearance and motion control abilities for it. For appearance control, we borrow intermediate latents and their features from text-to-image (T2I) generation to ensure that the generated first frame is equal to the given generated image. For temporal control, we replace the global temporal attention of the original T2V model with our proposed positional-corrected window attention to ensure that the other frames align well with the first frame. Empowered by the proposed methods, AnimateZero can successfully control the generation process without further training. As a zero-shot image animator for given images, AnimateZero also enables multiple new applications, including interactive video generation and real image animation. The detailed experiments demonstrate the effectiveness of the proposed method in both T2V and related applications.
Submitted 6 December, 2023; originally announced December 2023.
Comments: Project Page: https://vvictoryuki.github.io/animatezero.github.io/

arXiv:2312.03047 [pdf, other] cs.CV
MagicStick: Controllable Video Editing via Control Handle Transformations
Authors: Yue Ma, Xiaodong Cun, Sen Liang, Jinbo Xing, Yingqing He, Chenyang Qi, Siran Chen, Qifeng Chen
Abstract: Text-based video editing has recently attracted considerable interest in changing the style or replacing objects with a similar structure. Beyond this, we demonstrate that properties such as shape, size, location, and motion can also be edited in videos. Our key insight is that keyframe transformations of a specific internal feature (e.g., edge maps of objects or human pose) can easily propagate to other frames to provide generation guidance. We thus propose MagicStick, a controllable video editing method that edits video properties by applying transformations to the extracted internal control signals. In detail, to keep the appearance, we inflate both the pretrained image diffusion model and ControlNet to the temporal dimension and train low-rank adaptation (LoRA) layers to fit the specific scenes. Then, in editing, we perform an inversion and editing framework, where a finetuned ControlNet is introduced in both inversion and generation for attention guidance, with the proposed attention remix between the spatial attention maps of inversion and editing. Although succinct, our method is the first to show the capability of video property editing from a pre-trained text-to-image model. We present experiments on numerous examples within our unified framework. We also compare with shape-aware text-based editing and handcrafted motion video generation, demonstrating superior temporal consistency and editing capability compared with previous works. The code and models are available at https://github.com/mayuelala/MagicStick.
Submitted 18 November, 2024; v1 submitted 5 December, 2023; originally announced December 2023.
Comments: Accepted by WACV 2025, Project page: https://magic-stick-edit.github.io/ Github repository: https://github.com/mayuelala/MagicStick

arXiv:2311.16545 [pdf] cs.NI
Unravelling DNS Performance: A Historical Examination of F-ROOT in Southeast Asia
Authors: Jiajia Zhu, Chao Qi
Abstract: The DNS root server system uses Anycast technology to provide resolution through widely distributed root nodes. In recent years, the F-root node has seen astonishing growth and now boasts the largest number of nodes among the 13 root servers. Based on RIPE Atlas measurement data, we examined the availability and query latency of the F-root within the Southeast Asian region historically. The collected data illustrate how latency varies with changes in the number of root nodes, how the geographic distribution of responding root nodes changes across different periods, and how countries most recently differ in their latency distributions. This study sheds light on the evolving landscape of DNS infrastructure in Southeast Asia.
Submitted 28 November, 2023; originally announced November 2023.
Comments: 10 pages, 4 figures

arXiv:2311.15069 [pdf, ps, other] cs.IT eess.SP
Multiuser Beamforming for Partially-Connected Millimeter Wave Massive MIMO
Authors: Chenhao Qi, Jinlin Hu, Yang Du, Arumugam Nallanathan
Abstract: Multiuser beamforming is considered for partially-connected millimeter wave massive MIMO systems. Based on perfect channel state information (CSI), a low-complexity hybrid beamforming scheme that decouples the analog beamformer and the digital beamformer is proposed to maximize the sum-rate. The analog beamformer design is modeled as a phase alignment problem to harvest the array gain. Given the analog beamformer, the digital beamformer is designed by solving a weighted minimum mean squared error problem. Then, based on imperfect CSI, an analog-only beamformer design scheme is proposed, where the design problem aims at maximizing the desired signal power on the current user while minimizing the power on the other users to mitigate the multiuser interference. The original problem is then transformed into a series of independent beam nulling subproblems, and an efficient iterative algorithm using the majorization-minimization framework is proposed to solve the subproblems. Simulation results show that, under perfect CSI, the proposed scheme achieves almost the same sum-rate performance as the existing schemes but with lower computational complexity, and that, under imperfect CSI, the proposed analog-only beamforming design scheme can effectively mitigate the multiuser interference.
Submitted 25 November, 2023; originally announced November 2023.
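The multiuser beamforming entry above obtains its digital beamformer by solving a weighted minimum mean squared error problem. The sketch below is a much simpler, generic linear MMSE (regularized zero-forcing) precoder, shown only to illustrate the kind of digital stage involved; it is not the paper's weighted-MMSE or majorization-minimization algorithm, and the dimensions and noise level are arbitrary assumptions.

```python
import numpy as np

def mmse_precoder(H: np.ndarray, noise_var: float, total_power: float = 1.0) -> np.ndarray:
    """Generic linear MMSE (regularized zero-forcing) precoder.

    H has shape (K users, N transmit antennas); returns an N x K precoding
    matrix scaled to meet the total transmit power budget.
    """
    K = H.shape[0]
    # Regularized channel inversion; the regularizer grows with the noise variance.
    W = H.conj().T @ np.linalg.inv(H @ H.conj().T + noise_var * K / total_power * np.eye(K))
    # Scale so that the transmit power equals the budget.
    W *= np.sqrt(total_power / np.trace(W @ W.conj().T).real)
    return W

# Example: 4 single-antenna users, 16 transmit antennas, Rayleigh fading channel.
rng = np.random.default_rng(1)
H = (rng.standard_normal((4, 16)) + 1j * rng.standard_normal((4, 16))) / np.sqrt(2)
W = mmse_precoder(H, noise_var=0.1)
print(np.round(np.abs(H @ W), 2))  # near-diagonal: each user mostly receives its own stream
```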
arXiv:2311.15066 [pdf, other] cs.IT eess.SP
Beam Training and Tracking for Extremely Large-Scale MIMO Communications
Authors: Kangjian Chen, Chenhao Qi, Cheng-Xiang Wang, Geoffrey Ye Li
Abstract: In this paper, beam training and beam tracking are investigated for extremely large-scale multiple-input-multiple-output communication systems with partially-connected hybrid combining structures. Firstly, we propose a two-stage hybrid-field beam training scheme for both the near field and the far field. In the first stage, each subarray independently uses multiple far-field channel steering vectors to approximate near-field ones for analog combining. To find the codeword best fitting for the channel, digital combiners in the second stage are designed to combine the outputs of the analog combiners from the first stage. Then, based on the principle of stationary phase and the time-frequency duality, the expressions of subarray signals after analog combining are analytically derived and a beam refinement based on phase shifts of subarrays (BRPSS) scheme with closed-form solutions is proposed for high-resolution channel parameter estimation. Moreover, a low-complexity near-field beam tracking scheme is developed, where the kinematic model is adopted to characterize the channel variations and the extended Kalman filter is exploited for beam tracking. Simulation results verify the effectiveness of the proposed schemes.
Submitted 25 November, 2023; originally announced November 2023.
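The beam tracking entry above adopts a kinematic model with an extended Kalman filter (EKF). As a generic illustration of one EKF predict/update cycle for an angle and its rate under a constant-velocity model, consider the sketch below; the measurement model z = sin(angle) is a hypothetical stand-in and is not the observation model used in the paper.

```python
import numpy as np

# Generic EKF step for tracking an angle and its rate under a constant-velocity
# kinematic model. The nonlinear measurement z = sin(angle) is a hypothetical
# placeholder for a beam-dependent observation.

def ekf_step(x, P, z, dt, q, r):
    """One predict/update cycle. x = [angle, angular_rate], P = 2x2 covariance."""
    # Predict with the linear constant-velocity kinematic model.
    F = np.array([[1.0, dt], [0.0, 1.0]])
    Q = q * np.array([[dt**3 / 3, dt**2 / 2], [dt**2 / 2, dt]])
    x_pred = F @ x
    P_pred = F @ P @ F.T + Q
    # Update with the nonlinear measurement, linearized through its Jacobian.
    h = np.sin(x_pred[0])
    H = np.array([[np.cos(x_pred[0]), 0.0]])
    S = H @ P_pred @ H.T + r            # innovation variance (1x1)
    K = P_pred @ H.T / S                # Kalman gain (2x1)
    x_new = x_pred + (K * (z - h)).ravel()
    P_new = (np.eye(2) - K @ H) @ P_pred
    return x_new, P_new
```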
arXiv:2311.15062 [pdf, other] eess.SP cs.IT
Simultaneous Beam Training and Target Sensing in ISAC Systems with RIS
Authors: Kangjian Chen, Chenhao Qi, Octavia A. Dobre, Geoffrey Ye Li
Abstract: This paper investigates an integrated sensing and communication (ISAC) system with a reconfigurable intelligent surface (RIS). Our simultaneous beam training and target sensing (SBTTS) scheme enables the base station to perform beam training with the user terminals (UTs) and the RIS, and simultaneously to sense the targets. Based on our findings, the energy of the echoes from the RIS is accumulated in the angle-delay domain, while that from the targets is accumulated in the Doppler-delay domain. The SBTTS scheme can therefore distinguish the RIS from the targets within the mixed echoes from the RIS and the targets. We then propose a positioning and array orientation estimation (PAOE) scheme for both the line-of-sight and the non-line-of-sight channels, based on the beam training results of SBTTS, by developing a low-complexity two-dimensional fast search algorithm. Based on the SBTTS and PAOE schemes, we further compute the angle-of-arrival and angle-of-departure for the channels between the RIS and the UTs by exploiting the geometric relationship, to accomplish the beam alignment of the ISAC system. Simulation results verify the effectiveness of the proposed schemes.
Submitted 25 November, 2023; originally announced November 2023.

arXiv:2311.15060 [pdf, ps, other] eess.SP cs.IT
Key Issues in Wireless Transmission for NTN-Assisted Internet of Things
Authors: Chenhao Qi, Jing Wang, Leyi Lyu, Lei Tan, Jinming Zhang, Geoffrey Ye Li
Abstract: Non-terrestrial networks (NTNs) have become appealing solutions for seamless coverage in next-generation wireless transmission, where a large number of diversely distributed Internet of Things (IoT) devices can be efficiently served. The explosively growing number of IoT devices brings a new challenge for massive connection. The long-distance wireless signal propagation in NTNs leads to severe path loss and large latency, and the accurate acquisition of channel state information (CSI) is another challenge, especially for fast-moving non-terrestrial base stations (NTBSs). Moreover, the scarcity of on-board resources at NTBSs is also a challenge for resource allocation. To this end, we investigate three key issues, for which the existing schemes and emerging solutions are comprehensively presented. The first issue is enabling massive connection by designing random access to establish the wireless link and multiple access to transmit data streams. The second issue is accurately acquiring CSI in various channel conditions through channel estimation and beam training, where orthogonal time frequency space modulation and dynamic codebooks are in focus. The third issue is efficiently allocating the wireless resources, including power allocation, spectrum sharing, beam hopping, and beamforming. At the end of this article, some future research topics are identified.
Submitted 25 November, 2023; originally announced November 2023.
Comments: 7 pages, 6 figures

arXiv:2310.18451 cs.CY
Fusion of the Power from Citations: Enhance your Influence by Integrating Information from References
Authors: Cong Qi, Qin Liu, Kan Liu
Abstract: Influence prediction plays a crucial role in the academic community. The amount of scholars' influence determines whether their work will be accepted by others. Most existing research focuses on predicting one paper's citation count after a period or identifying the most influential papers among the massive candidates, without concentrating on an individual paper's negative or positive impact on its authors. Thus, this study aims to formulate the prediction problem to identify whether one paper can increase scholars' influence or not, which can provide feedback to the authors before they publish their papers. First, we present the self-adapted ACC (Average Annual Citation Counts) metric to measure authors' impact yearly, based on their annual published papers, paper citation counts, and contributions to each paper. Then, we propose the RD-GAT (Reference-Depth Graph Attention Network) model to integrate heterogeneous graph information from different depths of references by assigning attention coefficients to them. Experiments on the AMiner dataset demonstrate that the proposed ACC metric can represent authors' influence effectively, and that the RD-GAT model is more efficient on the academic citation network and has stronger robustness against overfitting than the baseline models. By applying the framework in this work, scholars can identify whether their papers can improve their influence in the future.
Submitted 25 June, 2024; v1 submitted 27 October, 2023; originally announced October 2023.
Comments: There is a problem in section 3

arXiv:2310.05163 [pdf, other] cs.CL
An Investigation of LLMs' Inefficacy in Understanding Converse Relations
Authors: Chengwen Qi, Bowen Li, Binyuan Hui, Bailin Wang, Jinyang Li, Jinwang Wu, Yuanjun Laili
Abstract: Large Language Models (LLMs) have achieved remarkable success in many formal-language-oriented tasks, such as structural data-to-text and semantic parsing. However, current benchmarks mostly follow the data distribution of the pre-training data of LLMs. Therefore, a natural question arises: do LLMs really understand the structured semantics of formal languages? In this paper, we investigate this problem on a special case, converse binary relations. We introduce a new benchmark, ConvRe, focusing on converse relations, which contains 17 relations and 1240 triples extracted from popular knowledge graph completion datasets. ConvRe features two tasks, Re2Text and Text2Re, which are formulated as multiple-choice question answering to evaluate LLMs' ability to determine the matching between relations and associated text. For the evaluation protocol, apart from different prompting methods, we further introduce variants to the test text and few-shot example text. We conduct experiments on three popular LLM families and have observed various scaling trends. The results suggest that LLMs often resort to shortcut learning and still face challenges on our proposed benchmark.
Submitted 13 November, 2023; v1 submitted 8 October, 2023; originally announced October 2023.
Comments: Accepted by EMNLP 2023
We propose to learn a generative model of the tool-use trajectories as a sequence of tool point clouds, which generalizes to different tool shapes. Given any novel tool, we first generate a tool-use trajectory and then optimize the sequence of tool poses to align with the generated trajectory. We train a single model on four different challenging deformable object manipulation tasks, using demonstration data from only one tool per task. The model generalizes to various novel tools, significantly outperforming baselines. We further test our trained policy in the real world with unseen tools, where it achieves the performance comparable to human. Additional materials can be found on our project website: https://sites.google.com/view/toolgen. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.00156v5-abstract-full').style.display = 'none'; document.getElementById('2310.00156v5-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.9 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2309.14491">arXiv:2309.14491</a> <span> [<a href="https://arxiv.org/pdf/2309.14491">pdf</a>, <a href="https://arxiv.org/format/2309.14491">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Unsupervised 3D Perception with 2D Vision-Language Distillation for Autonomous Driving </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Najibi%2C+M">Mahyar Najibi</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+J">Jingwei Ji</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yin Zhou</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+C+R">Charles R. Qi</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+X">Xinchen Yan</a>, <a href="/search/cs?searchtype=author&query=Ettinger%2C+S">Scott Ettinger</a>, <a href="/search/cs?searchtype=author&query=Anguelov%2C+D">Dragomir Anguelov</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2309.14491v1-abstract-short" style="display: inline;"> Closed-set 3D perception models trained on only a pre-defined set of object categories can be inadequate for safety critical applications such as autonomous driving where new object types can be encountered after deployment. 
In this paper, we present a multi-modal auto labeling pipeline capable of generating amodal 3D bounding boxes and tracklets for training models on open-set categories without… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.14491v1-abstract-full').style.display = 'inline'; document.getElementById('2309.14491v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2309.14491v1-abstract-full" style="display: none;"> Closed-set 3D perception models trained on only a pre-defined set of object categories can be inadequate for safety critical applications such as autonomous driving where new object types can be encountered after deployment. In this paper, we present a multi-modal auto labeling pipeline capable of generating amodal 3D bounding boxes and tracklets for training models on open-set categories without 3D human labels. Our pipeline exploits motion cues inherent in point cloud sequences in combination with the freely available 2D image-text pairs to identify and track all traffic participants. Compared to the recent studies in this domain, which can only provide class-agnostic auto labels limited to moving objects, our method can handle both static and moving objects in the unsupervised manner and is able to output open-vocabulary semantic labels thanks to the proposed vision-language knowledge distillation. Experiments on the Waymo Open Dataset show that our approach outperforms the prior work by significant margins on various unsupervised 3D perception tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.14491v1-abstract-full').style.display = 'none'; document.getElementById('2309.14491v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2023. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICCV 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2309.02563">arXiv:2309.02563</a> <span> [<a href="https://arxiv.org/pdf/2309.02563">pdf</a>, <a href="https://arxiv.org/format/2309.02563">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Evaluation Kidney Layer Segmentation on Whole Slide Imaging using Convolutional Neural Networks and Transformers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+M">Muhao Liu</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+C">Chenyang Qi</a>, <a href="/search/cs?searchtype=author&query=Bao%2C+S">Shunxing Bao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Q">Quan Liu</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+R">Ruining Deng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yu Wang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+S">Shilin Zhao</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+H">Haichun Yang</a>, <a href="/search/cs?searchtype=author&query=Huo%2C+Y">Yuankai Huo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2309.02563v1-abstract-short" style="display: inline;"> The segmentation of kidney layer structures, including cortex, outer stripe, inner stripe, and inner medulla within human kidney whole slide images (WSI) plays an essential role in automated image analysis in renal pathology. However, the current manual segmentation process proves labor-intensive and infeasible for handling the extensive digital pathology images encountered at a large scale. In re… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.02563v1-abstract-full').style.display = 'inline'; document.getElementById('2309.02563v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2309.02563v1-abstract-full" style="display: none;"> The segmentation of kidney layer structures, including cortex, outer stripe, inner stripe, and inner medulla within human kidney whole slide images (WSI) plays an essential role in automated image analysis in renal pathology. However, the current manual segmentation process proves labor-intensive and infeasible for handling the extensive digital pathology images encountered at a large scale. In response, the realm of digital renal pathology has seen the emergence of deep learning-based methodologies. However, very few, if any, deep learning based approaches have been applied to kidney layer structure segmentation. Addressing this gap, this paper assesses the feasibility of performing deep learning based approaches on kidney layer structure segmetnation. This study employs the representative convolutional neural network (CNN) and Transformer segmentation approaches, including Swin-Unet, Medical-Transformer, TransUNet, U-Net, PSPNet, and DeepLabv3+. 
We quantitatively evaluated six prevalent deep learning models on renal cortex layer segmentation using mouse kidney WSIs. The empirical results stemming from our approach exhibit compelling advancements, as evidenced by a decent Mean Intersection over Union (mIoU) index. The results demonstrate that Transformer models generally outperform CNN-based models. By enabling a quantitative evaluation of renal cortical structures, deep learning approaches are promising tools to empower medical professionals to make more informed decisions in kidney layer segmentation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.02563v1-abstract-full').style.display = 'none'; document.getElementById('2309.02563v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2308.03014">arXiv:2308.03014</a> <span> [<a href="https://arxiv.org/pdf/2308.03014">pdf</a>, <a href="https://arxiv.org/format/2308.03014">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Learning Multiple Gaits within Latent Space for Quadruped Robots </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+J">Jinze Wu</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+Y">Yufei Xue</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+C">Chenkun Qi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2308.03014v1-abstract-short" style="display: inline;"> Learning multiple gaits is non-trivial for legged robots, especially when encountering different terrains and velocity commands. In this work, we present an end-to-end training framework for learning multiple gaits for quadruped robots, tailored to the needs of robust locomotion, agile locomotion, and user's commands. A latent space is constructed concurrently by a gait encoder and a gait generato… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.03014v1-abstract-full').style.display = 'inline'; document.getElementById('2308.03014v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2308.03014v1-abstract-full" style="display: none;"> Learning multiple gaits is non-trivial for legged robots, especially when encountering different terrains and velocity commands. In this work, we present an end-to-end training framework for learning multiple gaits for quadruped robots, tailored to the needs of robust locomotion, agile locomotion, and user's commands. A latent space is constructed concurrently by a gait encoder and a gait generator, which helps the agent to reuse multiple gait skills to achieve adaptive gait behaviors. To learn natural behaviors for multiple gaits, we design gait-dependent rewards that are constructed explicitly from gait parameters and implicitly from conditional adversarial motion priors (CAMP).
We demonstrate such multiple gaits control on a quadruped robot Go1 with only proprioceptive sensors. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.03014v1-abstract-full').style.display = 'none'; document.getElementById('2308.03014v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.03206">arXiv:2306.03206</a> <span> [<a href="https://arxiv.org/pdf/2306.03206">pdf</a>, <a href="https://arxiv.org/format/2306.03206">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MoDAR: Using Motion Forecasting for 3D Object Detection in Point Cloud Sequences </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yingwei Li</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+C+R">Charles R. Qi</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yin Zhou</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+C">Chenxi Liu</a>, <a href="/search/cs?searchtype=author&query=Anguelov%2C+D">Dragomir Anguelov</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.03206v1-abstract-short" style="display: inline;"> Occluded and long-range objects are ubiquitous and challenging for 3D object detection. Point cloud sequence data provide unique opportunities to improve such cases, as an occluded or distant object can be observed from different viewpoints or gets better visibility over time. However, the efficiency and effectiveness in encoding long-term sequence data can still be improved. In this work, we prop… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.03206v1-abstract-full').style.display = 'inline'; document.getElementById('2306.03206v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.03206v1-abstract-full" style="display: none;"> Occluded and long-range objects are ubiquitous and challenging for 3D object detection. Point cloud sequence data provide unique opportunities to improve such cases, as an occluded or distant object can be observed from different viewpoints or gets better visibility over time. However, the efficiency and effectiveness in encoding long-term sequence data can still be improved. In this work, we propose MoDAR, using motion forecasting outputs as a type of virtual modality, to augment LiDAR point clouds. The MoDAR modality propagates object information from temporal contexts to a target frame, represented as a set of virtual points, one for each object from a waypoint on a forecasted trajectory. A fused point cloud of both raw sensor points and the virtual points can then be fed to any off-the-shelf point-cloud based 3D object detector. Evaluated on the Waymo Open Dataset, our method significantly improves prior art detectors by using motion forecasting from extra-long sequences (e.g. 
18 seconds), achieving new state of the arts, while not adding much computation overhead. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.03206v1-abstract-full').style.display = 'none'; document.getElementById('2306.03206v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">CVPR 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.00926">arXiv:2306.00926</a> <span> [<a href="https://arxiv.org/pdf/2306.00926">pdf</a>, <a href="https://arxiv.org/format/2306.00926">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Inserting Anybody in Diffusion Models via Celeb Basis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yuan%2C+G">Ge Yuan</a>, <a href="/search/cs?searchtype=author&query=Cun%2C+X">Xiaodong Cun</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yong Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+M">Maomao Li</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+C">Chenyang Qi</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xintao Wang</a>, <a href="/search/cs?searchtype=author&query=Shan%2C+Y">Ying Shan</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+H">Huicheng Zheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.00926v1-abstract-short" style="display: inline;"> Exquisite demand exists for customizing the pretrained large text-to-image model, $\textit{e.g.}$, Stable Diffusion, to generate innovative concepts, such as the users themselves. However, the newly-added concept from previous customization methods often shows weaker combination abilities than the original ones even given several images during training. We thus propose a new personalization method… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.00926v1-abstract-full').style.display = 'inline'; document.getElementById('2306.00926v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.00926v1-abstract-full" style="display: none;"> Exquisite demand exists for customizing the pretrained large text-to-image model, $\textit{e.g.}$, Stable Diffusion, to generate innovative concepts, such as the users themselves. However, the newly-added concept from previous customization methods often shows weaker combination abilities than the original ones even given several images during training. We thus propose a new personalization method that allows for the seamless integration of a unique individual into the pre-trained diffusion model using just $\textbf{one facial photograph}$ and only $\textbf{1024 learnable parameters}$ under $\textbf{3 minutes}$. 
As a result, we can effortlessly generate stunning images of this person in any pose or position, interacting with anyone and doing anything imaginable from text prompts. To achieve this, we first analyze and build a well-defined celeb basis from the embedding space of the pre-trained large text encoder. Then, given one facial photo as the target identity, we generate its own embedding by optimizing the weight of this basis and locking all other parameters. Empowered by the proposed celeb basis, the new identity in our customized model showcases a better concept combination ability than previous personalization methods. Besides, our model can also learn several new identities at once and let them interact with each other, which previous customization models fail to do. The code will be released. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.00926v1-abstract-full').style.display = 'none'; document.getElementById('2306.00926v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page: http://celeb-basis.github.io ; Github repository: https://github.com/ygtxr1997/CelebBasis</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2304.03834">arXiv:2304.03834</a> <span> [<a href="https://arxiv.org/pdf/2304.03834">pdf</a>, <a href="https://arxiv.org/format/2304.03834">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> WOMD-LiDAR: Raw Sensor Dataset Benchmark for Motion Forecasting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+K">Kan Chen</a>, <a href="/search/cs?searchtype=author&query=Ge%2C+R">Runzhou Ge</a>, <a href="/search/cs?searchtype=author&query=Qiu%2C+H">Hang Qiu</a>, <a href="/search/cs?searchtype=author&query=AI-Rfou%2C+R">Rami AI-Rfou</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+C+R">Charles R.
Qi</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+X">Xuanyu Zhou</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zoey Yang</a>, <a href="/search/cs?searchtype=author&query=Ettinger%2C+S">Scott Ettinger</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+P">Pei Sun</a>, <a href="/search/cs?searchtype=author&query=Leng%2C+Z">Zhaoqi Leng</a>, <a href="/search/cs?searchtype=author&query=Baniodeh%2C+M">Mustafa Baniodeh</a>, <a href="/search/cs?searchtype=author&query=Bogun%2C+I">Ivan Bogun</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Weiyue Wang</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+M">Mingxing Tan</a>, <a href="/search/cs?searchtype=author&query=Anguelov%2C+D">Dragomir Anguelov</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2304.03834v2-abstract-short" style="display: inline;"> Widely adopted motion forecasting datasets substitute the observed sensory inputs with higher-level abstractions such as 3D boxes and polylines. These sparse shapes are inferred through annotating the original scenes with perception systems' predictions. Such intermediate representations tie the quality of the motion forecasting models to the performance of computer vision models. Moreover, the hu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.03834v2-abstract-full').style.display = 'inline'; document.getElementById('2304.03834v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2304.03834v2-abstract-full" style="display: none;"> Widely adopted motion forecasting datasets substitute the observed sensory inputs with higher-level abstractions such as 3D boxes and polylines. These sparse shapes are inferred through annotating the original scenes with perception systems' predictions. Such intermediate representations tie the quality of the motion forecasting models to the performance of computer vision models. Moreover, the human-designed explicit interfaces between perception and motion forecasting typically pass only a subset of the semantic information present in the original sensory input. To study the effect of these modular approaches, design new paradigms that mitigate these limitations, and accelerate the development of end-to-end motion forecasting models, we augment the Waymo Open Motion Dataset (WOMD) with large-scale, high-quality, diverse LiDAR data for the motion forecasting task. The new augmented dataset WOMD-LiDAR consists of over 100,000 scenes that each spans 20 seconds, consisting of well-synchronized and calibrated high quality LiDAR point clouds captured across a range of urban and suburban geographies (https://waymo.com/open/data/motion/). Compared to Waymo Open Dataset (WOD), WOMD-LiDAR dataset contains 100x more scenes. Furthermore, we integrate the LiDAR data into the motion forecasting model training and provide a strong baseline. Experiments show that the LiDAR data brings improvement in the motion forecasting task. We hope that WOMD-LiDAR will provide new opportunities for boosting end-to-end motion forecasting models. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.03834v2-abstract-full').style.display = 'none'; document.getElementById('2304.03834v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 April, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICRA 2024 camera ready version. Dataset website: https://waymo.com/open/data/motion/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2304.02163">arXiv:2304.02163</a> <span> [<a href="https://arxiv.org/pdf/2304.02163">pdf</a>, <a href="https://arxiv.org/format/2304.02163">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> GINA-3D: Learning to Generate Implicit Neural Assets in the Wild </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shen%2C+B">Bokui Shen</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+X">Xinchen Yan</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+C+R">Charles R. Qi</a>, <a href="/search/cs?searchtype=author&query=Najibi%2C+M">Mahyar Najibi</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+B">Boyang Deng</a>, <a href="/search/cs?searchtype=author&query=Guibas%2C+L">Leonidas Guibas</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yin Zhou</a>, <a href="/search/cs?searchtype=author&query=Anguelov%2C+D">Dragomir Anguelov</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2304.02163v2-abstract-short" style="display: inline;"> Modeling the 3D world from sensor data for simulation is a scalable way of developing testing and validation environments for robotic learning problems such as autonomous driving. However, manually creating or re-creating real-world-like environments is difficult, expensive, and not scalable. Recent generative model techniques have shown promising progress to address such challenges by learning 3D… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.02163v2-abstract-full').style.display = 'inline'; document.getElementById('2304.02163v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2304.02163v2-abstract-full" style="display: none;"> Modeling the 3D world from sensor data for simulation is a scalable way of developing testing and validation environments for robotic learning problems such as autonomous driving. 
However, manually creating or re-creating real-world-like environments is difficult, expensive, and not scalable. Recent generative model techniques have shown promising progress to address such challenges by learning 3D assets using only plentiful 2D images -- but still suffer limitations as they leverage either human-curated image datasets or renderings from manually-created synthetic 3D environments. In this paper, we introduce GINA-3D, a generative model that uses real-world driving data from camera and LiDAR sensors to create realistic 3D implicit neural assets of diverse vehicles and pedestrians. Compared to the existing image datasets, the real-world driving setting poses new challenges due to occlusions, lighting-variations and long-tail distributions. GINA-3D tackles these challenges by decoupling representation learning and generative modeling into two stages with a learned tri-plane latent structure, inspired by recent advances in generative modeling of images. To evaluate our approach, we construct a large-scale object-centric dataset containing over 1.2M images of vehicles and pedestrians from the Waymo Open Dataset, and a new set of 80K images of long-tail instances such as construction equipment, garbage trucks, and cable cars. We compare our model with existing approaches and demonstrate that it achieves state-of-the-art performance in quality and diversity for both generated images and geometries. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.02163v2-abstract-full').style.display = 'none'; document.getElementById('2304.02163v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 April, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2023. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by CVPR 2023; Our WOD-ObjectAsset can be accessed through waymo.com/open</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2304.01064">arXiv:2304.01064</a> <span> [<a href="https://arxiv.org/pdf/2304.01064">pdf</a>, <a href="https://arxiv.org/format/2304.01064">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Real-time 6K Image Rescaling with Rate-distortion Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Qi%2C+C">Chenyang Qi</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xin Yang</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+K+L">Ka Leong Cheng</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Ying-Cong Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Q">Qifeng Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2304.01064v2-abstract-short" style="display: inline;"> Contemporary image rescaling aims at embedding a high-resolution (HR) image into a low-resolution (LR) thumbnail image that contains embedded information for HR image reconstruction. Unlike traditional image super-resolution, this enables high-fidelity HR image restoration faithful to the original one, given the embedded information in the LR thumbnail. However, state-of-the-art image rescaling me… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.01064v2-abstract-full').style.display = 'inline'; document.getElementById('2304.01064v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2304.01064v2-abstract-full" style="display: none;"> Contemporary image rescaling aims at embedding a high-resolution (HR) image into a low-resolution (LR) thumbnail image that contains embedded information for HR image reconstruction. Unlike traditional image super-resolution, this enables high-fidelity HR image restoration faithful to the original one, given the embedded information in the LR thumbnail. However, state-of-the-art image rescaling methods do not optimize the LR image file size for efficient sharing and fall short of real-time performance for ultra-high-resolution (e.g., 6K) image reconstruction. To address these two challenges, we propose a novel framework (HyperThumbnail) for real-time 6K rate-distortion-aware image rescaling. Our framework first embeds an HR image into a JPEG LR thumbnail by an encoder with our proposed quantization prediction module, which minimizes the file size of the embedding LR JPEG thumbnail while maximizing HR reconstruction quality. Then, an efficient frequency-aware decoder reconstructs a high-fidelity HR image from the LR one in real time. 
Extensive experiments demonstrate that our framework outperforms previous image rescaling baselines in rate-distortion performance and can perform 6K image reconstruction in real time. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.01064v2-abstract-full').style.display = 'none'; document.getElementById('2304.01064v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 April, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by CVPR 2023; Github Repository: https://github.com/AbnerVictor/HyperThumbnail</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2303.15659">arXiv:2303.15659</a> <span> [<a href="https://arxiv.org/pdf/2303.15659">pdf</a>, <a href="https://arxiv.org/format/2303.15659">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Geophysics">physics.geo-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1016/j.advwatres.2023.104448">10.1016/j.advwatres.2023.104448 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Boundary-to-Solution Mapping for Groundwater Flows in a Toth Basin </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sun%2C+J">Jingwei Sun</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jun Li</a>, <a href="/search/cs?searchtype=author&query=Hao%2C+Y">Yonghong Hao</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+C">Cuiting Qi</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+C">Chunmei Ma</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+H">Huazhi Sun</a>, <a href="/search/cs?searchtype=author&query=Begashaw%2C+N">Negash Begashaw</a>, <a href="/search/cs?searchtype=author&query=Comet%2C+G">Gurcan Comet</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Y">Yi Sun</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qi Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2303.15659v1-abstract-short" style="display: inline;"> In this paper, the authors propose a new approach to solving the groundwater flow equation in the Toth basin of arbitrary top and bottom topographies using deep learning. Instead of using traditional numerical solvers, they use a DeepONet to produce the boundary-to-solution mapping. 
This mapping takes the geometry of the physical domain along with the boundary conditions as inputs to output the st… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.15659v1-abstract-full').style.display = 'inline'; document.getElementById('2303.15659v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2303.15659v1-abstract-full" style="display: none;"> In this paper, the authors propose a new approach to solving the groundwater flow equation in the Toth basin of arbitrary top and bottom topographies using deep learning. Instead of using traditional numerical solvers, they use a DeepONet to produce the boundary-to-solution mapping. This mapping takes the geometry of the physical domain along with the boundary conditions as inputs to output the steady state solution of the groundwater flow equation. To implement the DeepONet, the authors approximate the top and bottom boundaries using truncated Fourier series or piecewise linear representations. They present two different implementations of the DeepONet: one where the Toth basin is embedded in a rectangular computational domain, and another where the Toth basin with arbitrary top and bottom boundaries is mapped into a rectangular computational domain via a nonlinear transformation. They implement the DeepONet with respect to the Dirichlet and Robin boundary condition at the top and the Neumann boundary condition at the impervious bottom boundary, respectively. Using this deep-learning enabled tool, the authors investigate the impact of surface topography on the flow pattern by both the top surface and the bottom impervious boundary with arbitrary geometries. They discover that the average slope of the top surface promotes long-distance transport, while the local curvature controls localized circulations. Additionally, they find that the slope of the bottom impervious boundary can seriously impact the long-distance transport of groundwater flows. Overall, this paper presents a new and innovative approach to solving the groundwater flow equation using deep learning, which allows for the investigation of the impact of surface topography on groundwater flow patterns. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.15659v1-abstract-full').style.display = 'none'; document.getElementById('2303.15659v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 March, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2023. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2303.10271">arXiv:2303.10271</a> <span> [<a href="https://arxiv.org/pdf/2303.10271">pdf</a>, <a href="https://arxiv.org/format/2303.10271">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> </div> </div> <p class="title is-5 mathjax"> VPU-EM: An Event-based Modeling Framework to Evaluate NPU Performance and Power Efficiency at Scale </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Qi%2C+C">Charles Qi</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yi Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hui Wang</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+Y">Yang Lu</a>, <a href="/search/cs?searchtype=author&query=Subramanian%2C+S+S">Shiva Shankar Subramanian</a>, <a href="/search/cs?searchtype=author&query=Cahill%2C+F">Finola Cahill</a>, <a href="/search/cs?searchtype=author&query=Tuohy%2C+C">Conall Tuohy</a>, <a href="/search/cs?searchtype=author&query=Li%2C+V">Victor Li</a>, <a href="/search/cs?searchtype=author&query=Qian%2C+X">Xu Qian</a>, <a href="/search/cs?searchtype=author&query=Crews%2C+D">Darren Crews</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L">Ling Wang</a>, <a href="/search/cs?searchtype=author&query=Roy%2C+S">Shivaji Roy</a>, <a href="/search/cs?searchtype=author&query=Deidda%2C+A">Andrea Deidda</a>, <a href="/search/cs?searchtype=author&query=Power%2C+M">Martin Power</a>, <a href="/search/cs?searchtype=author&query=Hanrahan%2C+N">Niall Hanrahan</a>, <a href="/search/cs?searchtype=author&query=Richmond%2C+R">Rick Richmond</a>, <a href="/search/cs?searchtype=author&query=Cheema%2C+U">Umer Cheema</a>, <a href="/search/cs?searchtype=author&query=Raha%2C+A">Arnab Raha</a>, <a href="/search/cs?searchtype=author&query=Palla%2C+A">Alessandro Palla</a>, <a href="/search/cs?searchtype=author&query=Baugh%2C+G">Gary Baugh</a>, <a href="/search/cs?searchtype=author&query=Mathaikutty%2C+D">Deepak Mathaikutty</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2303.10271v1-abstract-short" style="display: inline;"> State-of-art NPUs are typically architected as a self-contained sub-system with multiple heterogeneous hardware computing modules, and a dataflow-driven programming model. There lacks well-established methodology and tools in the industry to evaluate and compare the performance of NPUs from different architectures. We present an event-based performance modeling framework, VPU-EM, targeting scalabl… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.10271v1-abstract-full').style.display = 'inline'; document.getElementById('2303.10271v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2303.10271v1-abstract-full" style="display: none;"> State-of-art NPUs are typically architected as a self-contained sub-system with multiple heterogeneous hardware computing modules, and a dataflow-driven programming model. There lacks well-established methodology and tools in the industry to evaluate and compare the performance of NPUs from different architectures. 
We present an event-based performance modeling framework, VPU-EM, targeting scalable performance evaluation of modern NPUs across diversified AI workloads. The framework adopts high-level event-based system-simulation methodology to abstract away design details for speed, while maintaining hardware pipelining, concurrency and interaction with software task scheduling. It is natively developed in Python and built to interface directly with AI frameworks such as Tensorflow, PyTorch, ONNX and OpenVINO, linking various in-house NPU graph compilers to achieve optimized full model performance. Furthermore, VPU-EM also provides the capability to model power characteristics of NPU in Power-EM mode to enable joint performance/power analysis. Using VPU-EM, we conduct performance/power analysis of models from representative neural network architecture. We demonstrate that even though this framework is developed for Intel VPU, an Intel in-house NPU IP technology, the methodology can be generalized for analysis of modern NPUs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.10271v1-abstract-full').style.display = 'none'; document.getElementById('2303.10271v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 March, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 9 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> B.2.2; B.8.2 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2303.09535">arXiv:2303.09535</a> <span> [<a href="https://arxiv.org/pdf/2303.09535">pdf</a>, <a href="https://arxiv.org/format/2303.09535">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> FateZero: Fusing Attentions for Zero-shot Text-based Video Editing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Qi%2C+C">Chenyang Qi</a>, <a href="/search/cs?searchtype=author&query=Cun%2C+X">Xiaodong Cun</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yong Zhang</a>, <a href="/search/cs?searchtype=author&query=Lei%2C+C">Chenyang Lei</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xintao Wang</a>, <a href="/search/cs?searchtype=author&query=Shan%2C+Y">Ying Shan</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Q">Qifeng Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2303.09535v3-abstract-short" style="display: inline;"> The diffusion-based generative models have achieved remarkable success in text-based image generation. However, since it contains enormous randomness in generation progress, it is still challenging to apply such models for real-world visual content editing, especially in videos. 
In this paper, we propose FateZero, a zero-shot text-based editing method on real-world videos without per-prompt traini… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.09535v3-abstract-full').style.display = 'inline'; document.getElementById('2303.09535v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2303.09535v3-abstract-full" style="display: none;"> The diffusion-based generative models have achieved remarkable success in text-based image generation. However, since it contains enormous randomness in generation progress, it is still challenging to apply such models for real-world visual content editing, especially in videos. In this paper, we propose FateZero, a zero-shot text-based editing method on real-world videos without per-prompt training or use-specific mask. To edit videos consistently, we propose several techniques based on the pre-trained models. Firstly, in contrast to the straightforward DDIM inversion technique, our approach captures intermediate attention maps during inversion, which effectively retain both structural and motion information. These maps are directly fused in the editing process rather than generated during denoising. To further minimize semantic leakage of the source video, we then fuse self-attentions with a blending mask obtained by cross-attention features from the source prompt. Furthermore, we have implemented a reform of the self-attention mechanism in denoising UNet by introducing spatial-temporal attention to ensure frame consistency. Yet succinct, our method is the first one to show the ability of zero-shot text-driven video style and local attribute editing from the trained text-to-image model. We also have a better zero-shot shape-aware editing ability based on the text-to-video model. Extensive experiments demonstrate our superior temporal consistency and editing capability than previous works. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.09535v3-abstract-full').style.display = 'none'; document.getElementById('2303.09535v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 March, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to ICCV 2023 as an Oral Presentation. 
Project page: https://fate-zero-edit.github.io ; GitHub repository: https://github.com/ChenyangQiQi/FateZero</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2302.10425">arXiv:2302.10425</a> <span> [<a href="https://arxiv.org/pdf/2302.10425">pdf</a>, <a href="https://arxiv.org/format/2302.10425">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/TCSVT.2023.3289885">10.1109/TCSVT.2023.3289885 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Instance-incremental Scene Graph Generation from Real-world Point Clouds via Normalizing Flows </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Qi%2C+C">Chao Qi</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+J">Jianqin Yin</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jinghang Xu</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+P">Pengxiang Ding</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2302.10425v2-abstract-short" style="display: inline;"> This work introduces a new task of instance-incremental scene graph generation: Given a scene of the point cloud, representing it as a graph and automatically increasing novel instances. A graph denoting the object layout of the scene is finally generated. It is an important task since it helps to guide the insertion of novel 3D objects into a real-world scene in vision-based applications like aug… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2302.10425v2-abstract-full').style.display = 'inline'; document.getElementById('2302.10425v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2302.10425v2-abstract-full" style="display: none;"> This work introduces a new task of instance-incremental scene graph generation: Given a scene of the point cloud, representing it as a graph and automatically increasing novel instances. A graph denoting the object layout of the scene is finally generated. It is an important task since it helps to guide the insertion of novel 3D objects into a real-world scene in vision-based applications like augmented reality. It is also challenging because the complexity of the real-world point cloud brings difficulties in learning object layout experiences from the observation data (non-empty rooms with labeled semantics). We model this task as a conditional generation problem and propose a 3D autoregressive framework based on normalizing flows (3D-ANF) to address it. First, we represent the point cloud as a graph by extracting the label semantics and contextual relationships. Next, a model based on normalizing flows is introduced to map the conditional generation of graphic elements into the Gaussian process. The mapping is invertible. 
Thus, the real-world experiences represented in the observation data can be modeled in the training phase, and novel instances can be autoregressively generated based on the Gaussian process in the testing phase. To evaluate the performance of our method sufficiently, we implement this new task on the indoor benchmark dataset 3DSSG-O27R16 and our newly proposed graphical dataset of outdoor scenes GPL3D. Experiments show that our method generates reliable novel graphs from the real-world point cloud and achieves state-of-the-art performance on the datasets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2302.10425v2-abstract-full').style.display = 'none'; document.getElementById('2302.10425v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 February, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by IEEE TCSVT. The supplementary material is available in the media column of the journal version of the article</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2301.02969">arXiv:2301.02969</a> <span> [<a href="https://arxiv.org/pdf/2301.02969">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Multi-scale multi-modal micro-expression recognition algorithm based on transformer </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+F">Fengping Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jie Li</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+C">Chun Qi</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L">Lin Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+P">Pan Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2301.02969v2-abstract-short" style="display: inline;"> A micro-expression is a spontaneous unconscious facial muscle movement that can reveal the true emotions people attempt to hide. Although manual methods have made good progress and deep learning is gaining prominence. Due to the short duration of micro-expression and different scales of expressed in facial regions, existing algorithms cannot extract multi-modal multi-scale facial region features w… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2301.02969v2-abstract-full').style.display = 'inline'; document.getElementById('2301.02969v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2301.02969v2-abstract-full" style="display: none;"> A micro-expression is a spontaneous unconscious facial muscle movement that can reveal the true emotions people attempt to hide. 
Although manual methods have made good progress, deep learning is gaining prominence. Due to the short duration of micro-expressions and the different scales at which they are expressed in facial regions, existing algorithms cannot extract multi-modal multi-scale facial region features while taking into account contextual information to learn underlying features. Therefore, in order to solve the above problems, a multi-modal multi-scale algorithm based on a transformer network is proposed in this paper, aiming to fully learn local multi-grained features of micro-expressions through two modal features of micro-expressions: motion features and texture features. To obtain local area features of the face at different scales, we learned patch features at different scales for both modalities, and then fused multi-layer multi-headed attention weights to obtain effective features by weighting the patch features, and combined cross-modal contrastive learning for model optimization. We conducted comprehensive experiments on three spontaneous datasets, and the results show that the accuracy of the proposed algorithm on the single-measurement SMIC database is up to 78.73% and that the F1 value on CASMEII of the combined database is up to 0.9071, both of which are leading results. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2301.02969v2-abstract-full').style.display = 'none'; document.getElementById('2301.02969v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 January, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 January, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2212.08062">arXiv:2212.08062</a> <span> [<a href="https://arxiv.org/pdf/2212.08062">pdf</a>, <a href="https://arxiv.org/format/2212.08062">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MetaPortrait: Identity-Preserving Talking Head Generation with Fast Personalized Adaptation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+B">Bowen Zhang</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+C">Chenyang Qi</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+P">Pan Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+B">Bo Zhang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+H">HsiangTao Wu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+D">Dong Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Q">Qifeng Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yong Wang</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+F">Fang Wen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2212.08062v3-abstract-short" style="display: inline;"> In this work, we propose an ID-preserving talking head generation framework, which advances previous methods in two aspects.
arXiv:2212.08062 [pdf, other] (cs.CV)
MetaPortrait: Identity-Preserving Talking Head Generation with Fast Personalized Adaptation
Authors: Bowen Zhang, Chenyang Qi, Pan Zhang, Bo Zhang, HsiangTao Wu, Dong Chen, Qifeng Chen, Yong Wang, Fang Wen
Abstract: In this work, we propose an ID-preserving talking head generation framework that advances previous methods in two aspects. First, as opposed to interpolating from sparse flow, we claim that dense landmarks are crucial to achieving accurate geometry-aware flow fields. Second, inspired by face-swapping methods, we adaptively fuse the source identity during synthesis, so that the network better preserves the key characteristics of the image portrait. Although the proposed model surpasses prior methods in generation fidelity on established benchmarks, personalized fine-tuning is usually still needed to make talking head generation ready for real use, and this process is computationally demanding and unaffordable for standard users. To solve this, we propose a fast adaptation model that uses a meta-learning approach; the learned model can be adapted into a high-quality personalized model in as little as 30 seconds. Finally, a spatial-temporal enhancement module is proposed to improve fine details while ensuring temporal coherency. Extensive experiments demonstrate the significant superiority of our approach over the state of the art in both one-shot and personalized settings.
Submitted 26 March, 2023; v1 submitted 15 December, 2022; originally announced December 2022.
Comments: CVPR 2023; project page: https://meta-portrait.github.io
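The fast personalized adaptation above is attributed to a meta-learning approach, but the abstract does not say which one. The sketch below shows a Reptile-style meta-training loop as one plausible way to obtain an initialization that fine-tunes to a new identity in seconds; every callable (sample_identity_batch, personalization_loss, grad) and all step sizes are hypothetical placeholders rather than the paper's method.

```python
def reptile_meta_train(theta, sample_identity_batch, personalization_loss,
                       grad, meta_iters=1000, inner_steps=5,
                       inner_lr=1e-3, meta_lr=1e-2):
    """Reptile-style meta-training (illustrative only).

    theta: flat parameter vector (e.g. a numpy array) of the generator.
    sample_identity_batch(): returns training data for one person (an inner task).
    personalization_loss(theta, data): scalar fine-tuning loss on that data.
    grad(f, theta, data): gradient of f w.r.t. theta (e.g. via autodiff).
    """
    for _ in range(meta_iters):
        data = sample_identity_batch()
        phi = theta.copy()
        # Inner loop: a few SGD steps of personalized fine-tuning.
        for _ in range(inner_steps):
            phi -= inner_lr * grad(personalization_loss, phi, data)
        # Outer update: move the shared initialization toward the adapted
        # weights, so future identities need only seconds of fine-tuning.
        theta += meta_lr * (phi - theta)
    return theta
```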
arXiv:2212.03267 [pdf, other] (cs.CV)
NeRDi: Single-View NeRF Synthesis with Language-Guided Diffusion as General Image Priors
Authors: Congyue Deng, Chiyu "Max" Jiang, Charles R. Qi, Xinchen Yan, Yin Zhou, Leonidas Guibas, Dragomir Anguelov
Abstract: 2D-to-3D reconstruction is an ill-posed problem, yet humans are good at solving this problem due to their prior knowledge of the 3D world developed over years. Driven by this observation, we propose NeRDi, a single-view NeRF synthesis framework with general image priors from 2D diffusion models. Formulating single-view reconstruction as an image-conditioned 3D generation problem, we optimize the NeRF representations by minimizing a diffusion loss on its arbitrary view renderings with a pretrained image diffusion model under the input-view constraint. We leverage off-the-shelf vision-language models and introduce a two-section language guidance as conditioning inputs to the diffusion model. This is essentially helpful for improving multiview content coherence as it narrows down the general image prior conditioned on the semantic and visual features of the single-view input image. Additionally, we introduce a geometric loss based on estimated depth maps to regularize the underlying 3D geometry of the NeRF. Experimental results on the DTU MVS dataset show that our method can synthesize novel views with higher quality even compared to existing methods trained on this dataset. We also demonstrate our generalizability in zero-shot NeRF synthesis for in-the-wild images.
Submitted 6 December, 2022; originally announced December 2022.
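The optimization described in the NeRDi abstract combines a diffusion loss on arbitrary view renderings, an input-view constraint, and a depth-based geometric loss. One hedged way to write the combined objective, with $\lambda_{\mathrm{in}}$ and $\lambda_{\mathrm{depth}}$ as weights introduced here only for illustration, is

$$\min_{\theta}\;\mathcal{L}(\theta)=\mathbb{E}_{v}\,\mathcal{L}_{\mathrm{diff}}\!\big(R_v(\theta);\,e\big)+\lambda_{\mathrm{in}}\,\big\lVert R_{v_0}(\theta)-I_0\big\rVert^2+\lambda_{\mathrm{depth}}\,\mathcal{L}_{\mathrm{depth}}\!\big(D_{v_0}(\theta),\hat{D}_0\big),$$

where $\theta$ are the NeRF parameters, $R_v(\theta)$ renders view $v$, $I_0$ and $\hat{D}_0$ are the input image and its estimated depth map, and $e$ is the two-section language guidance passed to the pretrained diffusion model; the exact form of each term in the paper may differ.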
arXiv:2210.15751 [pdf, other] (cs.RO, cs.AI)
Planning with Spatial-Temporal Abstraction from Point Clouds for Deformable Object Manipulation
Authors: Xingyu Lin, Carl Qi, Yunchu Zhang, Zhiao Huang, Katerina Fragkiadaki, Yunzhu Li, Chuang Gan, David Held
Abstract: Effective planning of long-horizon deformable object manipulation requires suitable abstractions at both the spatial and temporal levels. Previous methods typically either focus on short-horizon tasks or make strong assumptions that full-state information is available, which prevents their use on deformable objects. In this paper, we propose PlAnning with Spatial-Temporal Abstraction (PASTA), which incorporates both spatial abstraction (reasoning about objects and their relations to each other) and temporal abstraction (reasoning over skills instead of low-level actions). Our framework maps high-dimensional 3D observations such as point clouds into a set of latent vectors and plans over skill sequences on top of the latent set representation. We show that our method can effectively perform challenging sequential deformable object manipulation tasks in the real world, which require combining multiple tool-use skills such as cutting with a knife, pushing with a pusher, and spreading dough with a roller.
Submitted 23 June, 2023; v1 submitted 27 October, 2022; originally announced October 2022.
Comments: Published at the Conference on Robot Learning (CoRL 2022)
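PASTA's planning, as described above, operates on a latent set computed from the point cloud and searches over skill sequences rather than low-level actions. The sketch below is a schematic of that pattern under my own simplifying assumptions (exhaustive enumeration, a single goal point cloud); encode_set, latent_dynamics, and set_distance are hypothetical stand-ins for the learned modules, not the authors' code.

```python
import itertools

def plan_skill_sequence(obs_points, goal_points, skills, horizon,
                        encode_set, latent_dynamics, set_distance):
    """Score skill sequences in latent-set space and return the best one.

    obs_points, goal_points: (N, 3) point clouds of the current and goal scenes.
    skills: list of skill identifiers (e.g. "cut", "push", "spread-roller").
    encode_set(points) -> set of latent vectors (spatial abstraction).
    latent_dynamics(latents, skill) -> predicted latents after running the skill.
    set_distance(a, b) -> scalar mismatch between two latent sets.
    """
    z0 = encode_set(obs_points)
    z_goal = encode_set(goal_points)
    best_seq, best_cost = None, float("inf")
    for seq in itertools.product(skills, repeat=horizon):  # temporal abstraction
        z = z0
        for skill in seq:
            z = latent_dynamics(z, skill)   # roll out one skill in latent space
        cost = set_distance(z, z_goal)
        if cost < best_cost:
            best_seq, best_cost = seq, cost
    return best_seq
```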
arXiv:2210.15492 [pdf] (cs.CV, eess.IV)
Reconstruction of compressed spectral imaging based on global structure and spectral correlation
Authors: Pan Wang, Jie Li, Jieru Chen, Lin Wang, Chun Qi
Abstract: In this paper, a convolutional sparse coding method based on global structure characteristics and spectral correlation is proposed for the reconstruction of compressive spectral images. The spectral data are modeled as the convolution sum of convolution kernels and their corresponding coefficients; operating on the global image information through the convolution kernels preserves the structural information of the spectral image in the spatial dimension. To fully exploit the constraints between spectra, the coefficients corresponding to the convolution kernels are constrained by the L_{2,1} norm to improve spectral accuracy. To address the insensitivity of convolutional sparse coding to low frequencies, a global total-variation (TV) constraint is added to estimate the low-frequency components. This not only ensures an effective estimate of the low-frequency content but also turns the convolutional sparse coding step into a denoising process, which simplifies reconstruction. Simulations show that, compared with current mainstream optimization methods, the proposed method improves reconstruction quality by up to 4 dB in PSNR and 10% in SSIM, with a clear improvement in the details of the reconstructed image.
Submitted 9 January, 2023; v1 submitted 27 October, 2022; originally announced October 2022.
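Read literally, the abstract above suggests an objective with a convolutional data-fidelity term, an L_{2,1} penalty on the coefficient maps, and a global TV term. A hedged reconstruction of that objective, where $y$ is the compressive measurement, $\Phi$ the measurement operator, $d_k$ the convolution kernels, $x_k$ the coefficient maps stacked into $X$, and $\lambda,\tau$ weights introduced here for illustration, is

$$\min_{\{x_k\}}\ \tfrac{1}{2}\Big\lVert y-\Phi\Big(\sum_k d_k * x_k\Big)\Big\rVert_2^2+\lambda\,\lVert X\rVert_{2,1}+\tau\,\mathrm{TV}\Big(\sum_k d_k * x_k\Big),$$

which should be taken as one plausible reading of the abstract rather than the paper's exact formulation.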
arXiv:2210.13729 [pdf, other] (cs.AI, cs.CL, cs.CV)
Hybrid Reinforced Medical Report Generation with M-Linear Attention and Repetition Penalty
Authors: Wenting Xu, Zhenghua Xu, Junyang Chen, Chang Qi, Thomas Lukasiewicz
Abstract: To reduce doctors' workload, deep-learning-based automatic medical report generation has recently attracted growing research effort, where deep convolutional neural networks (CNNs) are employed to encode the input images and recurrent neural networks (RNNs) are used to decode the visual features into medical reports automatically. However, these state-of-the-art methods mainly suffer from three shortcomings: (i) incomprehensive optimization, (ii) low-order and unidimensional attention mechanisms, and (iii) repeated generation. In this article, we propose a hybrid reinforced medical report generation method with m-linear attention and a repetition penalty mechanism (HReMRG-MR) to overcome these problems. Specifically, a hybrid reward with different weights is employed to remedy the limitations of single-metric-based rewards, and we propose a search algorithm with linear complexity to approximate the best weight combination. Furthermore, we use m-linear attention modules to explore high-order feature interactions and achieve multi-modal reasoning, while a repetition penalty penalizes repeated terms during the model's training process. Extensive experiments on two public datasets show that HReMRG-MR greatly outperforms state-of-the-art baselines on all metrics. A series of ablation experiments verifies the effectiveness of all proposed components, and a reward-search toy experiment shows that the proposed search approach can significantly reduce the search time while approximating the best performance.
Submitted 14 October, 2022; originally announced October 2022.
Comments: This paper is currently under peer review at IEEE TNNLS
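The two training-signal ideas above, a weighted multi-metric ("hybrid") reward and a repetition penalty, can be illustrated with a short sketch. Everything below is a simplification: the metric callables and their weights are hypothetical, and the n-gram counting is my own rendering of "applying penalties to repeated terms", not the paper's definition.

```python
from collections import Counter

def repetition_penalty(tokens, n=3):
    """Fraction of repeated n-grams in a generated report (0 = no repetition)."""
    ngrams = [tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]
    if not ngrams:
        return 0.0
    counts = Counter(ngrams)
    repeated = sum(c - 1 for c in counts.values())
    return repeated / len(ngrams)

def hybrid_reward(report_tokens, reference_tokens, metrics, weights, beta=1.0):
    """Weighted sum of generation metrics minus a repetition penalty.

    metrics: dict name -> callable(report_tokens, reference_tokens) -> score
    weights: dict name -> weight (e.g. found by a weight-search procedure)
    """
    score = sum(weights[name] * fn(report_tokens, reference_tokens)
                for name, fn in metrics.items())
    return score - beta * repetition_penalty(report_tokens)
```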
arXiv:2210.08375 [pdf, other] (cs.CV, cs.LG)
Improving the Intra-class Long-tail in 3D Detection via Rare Example Mining
Authors: Chiyu Max Jiang, Mahyar Najibi, Charles R. Qi, Yin Zhou, Dragomir Anguelov
Abstract: Continued improvements in deep learning architectures have steadily advanced the overall performance of 3D object detectors to levels on par with humans for certain tasks and datasets, where the overall performance is mostly driven by common examples. However, even the best-performing models make the most naive mistakes on rare examples that do not appear frequently in the training data, such as vehicles with irregular geometries. Most studies in the long-tail literature focus on class-imbalanced classification problems with known, imbalanced label counts per class, but they are not directly applicable to the intra-class long-tail examples in problems with large intra-class variations such as 3D object detection, where instances with the same class label can have drastically varied properties such as shapes and sizes. Other works propose to mitigate this problem using active learning based on the criteria of uncertainty, difficulty, or diversity. In this study, we identify a new conceptual dimension, rareness, for mining new data to improve the long-tail performance of models. We show that rareness, as opposed to difficulty, is the key to data-centric improvements for 3D detectors, since rareness is the result of a lack of data support while difficulty is related to fundamental ambiguity in the problem. We propose a general and effective method to identify the rareness of objects based on density estimation in the feature space using flow models, and a principled, cost-aware formulation for mining rare object tracks, which improves overall model performance and, more importantly, significantly improves the performance for rare objects (by 30.97%).
Submitted 15 October, 2022; originally announced October 2022.
Comments: Accepted to European Conference on Computer Vision (ECCV) 2022
MSC Class: 68T45
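The mining signal above is rareness, estimated as low density in a learned feature space with a flow model. The sketch below shows the general recipe with a Gaussian kernel density estimate standing in for the normalizing flow so that it stays self-contained; the feature inputs and the simple top-k selection are my simplifications of the paper's cost-aware track mining.

```python
import numpy as np

def fit_kde(train_feats, bandwidth=1.0):
    """Stand-in density estimator (the paper uses a flow model instead)."""
    def log_density(x):
        d2 = ((train_feats - x) ** 2).sum(axis=1) / (2 * bandwidth ** 2)
        return np.log(np.exp(-d2).mean() + 1e-12)
    return log_density

def mine_rare_examples(train_feats, pool_feats, budget):
    """Return indices of the `budget` pool examples with the lowest density
    under a model fit on the training features, i.e. the rarest examples."""
    log_p = fit_kde(train_feats)
    rareness = np.array([-log_p(x) for x in pool_feats])  # high = rare
    return np.argsort(-rareness)[:budget]
```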
arXiv:2210.08064 [pdf, other] (cs.CV, cs.RO)
LESS: Label-Efficient Semantic Segmentation for LiDAR Point Clouds
Authors: Minghua Liu, Yin Zhou, Charles R. Qi, Boqing Gong, Hao Su, Dragomir Anguelov
Abstract: Semantic segmentation of LiDAR point clouds is an important task in autonomous driving. However, training deep models via conventional supervised methods requires large datasets which are costly to label. It is critical to have label-efficient segmentation approaches to scale up the model to new operational domains or to improve performance on rare cases. While most prior works focus on indoor scenes, we are among the first to propose a label-efficient semantic segmentation pipeline for outdoor scenes with LiDAR point clouds. Our method co-designs an efficient labeling process with semi/weakly supervised learning and is applicable to nearly any 3D semantic segmentation backbone. Specifically, we leverage geometry patterns in outdoor scenes to perform a heuristic pre-segmentation that reduces manual labeling, and we jointly design the learning targets with the labeling process. In the learning step, we leverage prototype learning to obtain more descriptive point embeddings and use multi-scan distillation to exploit richer semantics from temporally aggregated point clouds to boost the performance of single-scan models. Evaluated on the SemanticKITTI and nuScenes datasets, our proposed method outperforms existing label-efficient methods. With extremely limited human annotations (e.g., 0.1% point labels), it is even highly competitive with the fully supervised counterpart trained on 100% labels.
Submitted 14 October, 2022; originally announced October 2022.
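One ingredient above is multi-scan distillation, where a teacher that sees temporally aggregated scans supervises the single-scan student. The snippet below renders that idea as a temperature-scaled soft-label (KL) distillation loss; the point correspondence between teacher and student, the temperature, and the exact loss form are assumptions for illustration only.

```python
import numpy as np

def softmax(logits, axis=-1):
    z = logits - logits.max(axis=axis, keepdims=True)
    e = np.exp(z)
    return e / e.sum(axis=axis, keepdims=True)

def multi_scan_distillation_loss(student_logits, teacher_logits, temperature=2.0):
    """Per-point KL(teacher || student), averaged over the scan.

    student_logits: (N, C) predictions from the single-scan model.
    teacher_logits: (N, C) predictions from a model fed the temporally
                    aggregated multi-scan point cloud, indexed to the same points.
    """
    t = softmax(teacher_logits / temperature)
    s = softmax(student_logits / temperature)
    kl = (t * (np.log(t + 1e-12) - np.log(s + 1e-12))).sum(axis=1)
    return (temperature ** 2) * kl.mean()
```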
arXiv:2210.08061 [pdf, other] (cs.CV, cs.LG, cs.RO)
Motion Inspired Unsupervised Perception and Prediction in Autonomous Driving
Authors: Mahyar Najibi, Jingwei Ji, Yin Zhou, Charles R. Qi, Xinchen Yan, Scott Ettinger, Dragomir Anguelov
Abstract: Learning-based perception and prediction modules in modern autonomous driving systems typically rely on expensive human annotation and are designed to perceive only a handful of predefined object categories. This closed-set paradigm is insufficient for the safety-critical autonomous driving task, where the autonomous vehicle needs to process arbitrarily many types of traffic participants and their motion behaviors in a highly dynamic world. To address this difficulty, this paper pioneers a novel and challenging direction: training perception and prediction models to understand open-set moving objects with no human supervision. Our proposed framework uses self-learned flow to trigger an automated meta-labeling pipeline that provides automatic supervision. 3D detection experiments on the Waymo Open Dataset show that our method significantly outperforms classical unsupervised approaches and is even competitive with the counterpart that uses supervised scene flow. We further show that our approach generates highly promising results in open-set 3D detection and trajectory prediction, confirming its potential in closing the safety gap of fully supervised systems.
Submitted 14 October, 2022; originally announced October 2022.
Comments: ECCV 2022
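The core mechanism above is that self-learned scene flow triggers an automated meta-labeling pipeline for open-set moving objects. The code below is only a cartoon of such a pipeline: points with large estimated flow are grouped into clusters that become pseudo-objects. The flow estimate and the clustering routine (cluster_fn) are hypothetical inputs, and the real pipeline is certainly more involved.

```python
import numpy as np

def auto_label_moving_objects(points, flow, cluster_fn, speed_thresh=0.5, dt=0.1):
    """Turn self-supervised scene flow into pseudo-labels for moving objects.

    points: (N, 3) LiDAR points; flow: (N, 3) estimated per-point motion over dt.
    cluster_fn: callable mapping an (M, 3) array to an (M,) array of cluster ids
                (any spatial clustering routine).
    Returns a list of axis-aligned pseudo-boxes, one per detected moving cluster.
    """
    speed = np.linalg.norm(flow, axis=1) / dt
    moving = points[speed > speed_thresh]          # keep only moving points
    if len(moving) == 0:
        return []
    labels = cluster_fn(moving)
    boxes = []
    for cid in np.unique(labels):
        cluster = moving[labels == cid]
        boxes.append(np.concatenate([cluster.min(axis=0), cluster.max(axis=0)]))
    return boxes
```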
arXiv:2210.05018 [pdf, other] (cs.CV)
LidarNAS: Unifying and Searching Neural Architectures for 3D Point Clouds
Authors: Chenxi Liu, Zhaoqi Leng, Pei Sun, Shuyang Cheng, Charles R. Qi, Yin Zhou, Mingxing Tan, Dragomir Anguelov
Abstract: Developing neural models that accurately understand objects in 3D point clouds is essential for the success of robotics and autonomous driving. However, arguably due to the higher-dimensional nature of the data (as compared to images), existing neural architectures exhibit a large variety in their designs, including but not limited to the views considered, the format of the neural features, and the neural operations used. The lack of a unified framework and interpretation makes it hard to put these designs in perspective, as well as to systematically explore new ones. In this paper, we begin by proposing such a unified framework, with the key idea being to factorize the neural networks into a series of view transforms and neural layers. We demonstrate that this modular framework can reproduce a variety of existing works while allowing a fair comparison of backbone designs. Then, we show how this framework can easily materialize into a concrete neural architecture search (NAS) space, allowing a principled NAS-for-3D exploration.
By performing evolutionary NAS on the 3D object detection task on the Waymo Open Dataset, we not only outperform state-of-the-art models but also report the interesting finding that NAS tends to discover the same macro-level architecture concept for both the vehicle and pedestrian classes.
Submitted 10 October, 2022; originally announced October 2022.
Comments: ECCV 2022
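The unified framework above factorizes a 3D backbone into a series of view transforms and neural layers, which is what makes a concrete NAS space possible. As a hedged illustration, the sketch below encodes an architecture as a list of (view transform, neural layer) choices and runs a toy random search; the option names are invented examples, and the paper itself uses evolutionary search over its own space.

```python
import random

# Hypothetical option pools; the real space is defined by the paper's framework.
VIEW_TRANSFORMS = ["identity", "point-to-pillar", "point-to-voxel", "voxel-to-bev"]
NEURAL_LAYERS = ["pointnet-block", "sparse-conv-block", "2d-conv-block", "transformer-block"]

def sample_architecture(num_stages=4, rng=random):
    """An architecture = a sequence of (view transform, neural layer) stages."""
    return [(rng.choice(VIEW_TRANSFORMS), rng.choice(NEURAL_LAYERS))
            for _ in range(num_stages)]

def random_search(evaluate, num_candidates=16):
    """Toy stand-in for the evolutionary search: keep the best-scoring sample.

    evaluate(arch) -> validation metric (higher is better); supplied by the user.
    """
    candidates = [sample_architecture() for _ in range(num_candidates)]
    return max(candidates, key=evaluate)
```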
arXiv:2208.00714 [pdf, other] (cs.IT, eess.SP)
Hybrid Precoding for Mixture Use of Phase Shifters and Switches in mmWave Massive MIMO
Authors: Chenhao Qi, Qiang Liu, Xianghao Yu, Geoffrey Ye Li
Abstract: A variable-phase-shifter (VPS) architecture with hybrid precoding for the mixture use of phase shifters and switches is proposed for millimeter wave massive multiple-input multiple-output communications. For the VPS architecture, a hybrid precoding design (HPD) scheme, called VPS-HPD, is proposed to optimize the phases according to the channel state information by alternately optimizing the analog precoder and the digital precoder. To reduce the computational complexity of the VPS-HPD scheme, a low-complexity HPD scheme for the VPS architecture (VPS-LC-HPD) is then proposed, consisting of alternating optimization in three stages, where each stage has a closed-form solution and can be implemented efficiently. To reduce the hardware complexity introduced by the large number of switches, we consider a group-connected VPS architecture and propose an HPD scheme in which the HPD problem is divided into multiple independent subproblems, each of which can be flexibly solved by the VPS-HPD or VPS-LC-HPD scheme. Simulation results verify the effectiveness of the proposed schemes and show that they achieve satisfactory spectral efficiency with reduced computational or hardware complexity.
Submitted 1 August, 2022; originally announced August 2022.
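For background on entries like the one above: hybrid precoding designs are typically judged by the spectral efficiency achieved by the product of an analog precoder $\mathbf{F}_{\mathrm{RF}}$ (here realized with phase shifters and switches) and a digital precoder $\mathbf{F}_{\mathrm{BB}}$. The standard narrowband expression, cited here as common background rather than as a formula from the paper, is

$$R=\log_2\left|\mathbf{I}_{N_s}+\frac{\rho}{N_s\sigma^2}\,\mathbf{H}\,\mathbf{F}_{\mathrm{RF}}\mathbf{F}_{\mathrm{BB}}\mathbf{F}_{\mathrm{BB}}^{H}\mathbf{F}_{\mathrm{RF}}^{H}\mathbf{H}^{H}\right|,$$

with $\mathbf{H}$ the channel, $N_s$ the number of data streams, $\rho$ the transmit power, and $\sigma^2$ the noise variance; an alternating optimization such as VPS-HPD updates $\mathbf{F}_{\mathrm{RF}}$ (under the phase-shifter and switch constraints) and $\mathbf{F}_{\mathrm{BB}}$ in turn to improve this quantity.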