Search | arXiv e-print repository

<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 86 results for author: <span class="mathjax">Kang, G</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Kang%2C+G">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Kang, G"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Kang%2C+G&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Kang, G"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Kang%2C+G&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Kang%2C+G&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Kang%2C+G&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06818">arXiv:2502.06818</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.06818">pdf</a>, <a href="https://arxiv.org/format/2502.06818">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Globality Strikes Back: Rethinking the Global Knowledge of CLIP in Training-Free Open-Vocabulary Semantic Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Jingyun Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+C">Cilin Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Kang%2C+G">Guoliang Kang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.06818v1-abstract-short" style="display: inline;"> Recent works modify CLIP to perform open-vocabulary semantic segmentation in a training-free manner (TF-OVSS). In CLIP, patch-wise image representations mainly encode the homogeneous image-level properties and thus are not discriminative enough, hindering its application to the dense prediction task. Previous works make image features more distinct across patches, through making each patch mainly&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06818v1-abstract-full').style.display = 'inline'; document.getElementById('2502.06818v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.06818v1-abstract-full" style="display: none;"> Recent works modify CLIP to perform open-vocabulary semantic segmentation in a training-free manner (TF-OVSS). In CLIP, patch-wise image representations mainly encode the homogeneous image-level properties and thus are not discriminative enough, hindering its application to the dense prediction task. 
Previous works make image features more distinct across patches, through making each patch mainly attend to itself or the neighboring patches within a narrow local window. However, with their modifications, the ability of CLIP to aggregate global context information, which is known to be useful for distinguishing confusing categories, is largely weakened. In this paper, we propose a new method named GCLIP, which mines the beneficial global knowledge of CLIP to facilitate the TF-OVSS task. Firstly, we aim to equip the last-block attention with image-level properties while not introducing homogeneous attention patterns across patches. In GCLIP, we merge the attention from the global token emerging blocks with the Query-Query attention to realize this goal. Secondly, we aim to make the Value embeddings of the last-block attention module more distinct and semantically correlated. To realize this, we design a novel channel suppression strategy. As the representation of each patch is finally determined by the attention weights and the Value embeddings, our method can generate more discriminative patch-level image features while absorbing global context information. Extensive experiments on five standard benchmarks demonstrate that our method consistently outperforms previous state-of-the-arts. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06818v1-abstract-full').style.display = 'none'; document.getElementById('2502.06818v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Under review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.16714">arXiv:2501.16714</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.16714">pdf</a>, <a href="https://arxiv.org/format/2501.16714">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Separate Motion from Appearance: Customizing Motion via Customizing Text-to-Video Diffusion Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+H">Huijie Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Jingyun Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+S">Shuai Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+J">Jie Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+X">Xiaoming Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Kang%2C+G">Guoliang Kang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.16714v1-abstract-short" style="display: inline;"> Motion customization aims to adapt the diffusion model (DM) to generate videos with the motion specified by a set of video clips with the same motion 
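
The abstract above names two concrete operations on CLIP's last attention block: replacing the standard Query-Key attention with Query-Query attention, and merging it with attention maps taken from blocks where global tokens emerge. The exact merging rule and block selection are not given here, so the sketch below only illustrates the Query-Query idea with a simple convex-combination merge; `alpha` and the `global_attn` input are assumptions, not the paper's formulation.

```python
import torch
import torch.nn.functional as F

def query_query_attention(q, scale=None):
    """Attention computed between queries only: softmax(Q Q^T / sqrt(d))."""
    d = q.shape[-1]
    scale = scale if scale is not None else d ** -0.5
    return F.softmax(q @ q.transpose(-2, -1) * scale, dim=-1)

def merged_last_block_attention(q, global_attn, alpha=0.5):
    """Blend Query-Query attention with an attention map taken from an
    earlier ("global-token emerging") block. The convex combination and
    alpha=0.5 are illustrative choices, not the paper's exact rule."""
    qq_attn = query_query_attention(q)
    return alpha * qq_attn + (1.0 - alpha) * global_attn

if __name__ == "__main__":
    n_patches, dim = 197, 64                # e.g., ViT-B/16 with a CLS token
    q = torch.randn(1, n_patches, dim)
    global_attn = F.softmax(torch.randn(1, n_patches, n_patches), dim=-1)
    attn = merged_last_block_attention(q, global_attn)
    print(attn.shape)                       # torch.Size([1, 197, 197])
```

Because both inputs are row-stochastic, the merged map remains a valid attention distribution over patches.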

2. arXiv:2501.16714 [pdf, other] cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence)
Separate Motion from Appearance: Customizing Motion via Customizing Text-to-Video Diffusion Models
Authors: Huijie Liu, Jingyun Wang, Shuai Ma, Jie Hu, Xiaoming Wei, Guoliang Kang
Abstract: Motion customization aims to adapt a diffusion model (DM) to generate videos with the motion specified by a set of video clips sharing the same motion concept. To realize this goal, the adaptation of the DM should be able to model the specified motion concept without compromising the ability to generate diverse appearances. Thus, the key to solving this problem lies in how to separate the motion concept from the appearance during the adaptation of the DM. Typical previous works explore different ways to represent and insert a motion concept into large-scale pretrained text-to-video diffusion models, e.g., learning a motion LoRA, using latent noise residuals, etc. While those methods can encode the motion concept, they also inevitably encode the appearance of the reference videos, resulting in weakened appearance generation capability. In this paper, we follow the typical approach of learning a motion LoRA to encode the motion concept, but propose two novel strategies to enhance motion-appearance separation: temporal attention purification (TAP) and appearance highway (AH). Specifically, we assume that in the temporal attention module, the pretrained Value embeddings are sufficient to serve as the basic components needed to produce a new motion. Thus, in TAP, we choose only to reshape the temporal attention with motion LoRAs so that the Value embeddings can be reorganized to produce a new motion. Further, in AH, we alter the starting point of each skip connection in the U-Net from the output of each temporal attention module to the output of each spatial attention module. Extensive experiments demonstrate that, compared to previous works, our method can generate videos with appearance more aligned with the text descriptions and motion more consistent with the reference videos.
Submitted 28 January, 2025; originally announced January 2025.
Comments: 8 pages, 6 figures

3. arXiv:2501.03122 [pdf, other] cs.CV (Computer Vision and Pattern Recognition)
Normalizing Batch Normalization for Long-Tailed Recognition
Authors: Yuxiang Bao, Guoliang Kang, Linlin Yang, Xiaoyue Duan, Bo Zhao, Baochang Zhang
Abstract: In real-world scenarios, the number of training samples across classes usually follows a long-tailed distribution. A conventionally trained network may achieve unexpectedly inferior performance on the rare class compared to the frequent class. Most previous works attempt to rectify the network bias at the data level or the classifier level. Differently, in this paper, we identify that the bias towards the frequent class may be encoded into the features, i.e., the rare-specific features which play a key role in discriminating the rare class are much weaker than the frequent-specific features. Based on this observation, we introduce a simple yet effective approach: normalizing the parameters of the Batch Normalization (BN) layer to explicitly rectify the feature bias. To this end, we represent the Weight/Bias parameters of a BN layer as a vector, normalize it into a unit vector, and multiply the unit vector by a scalar learnable parameter. By decoupling the direction and magnitude of the BN parameters during learning, the Weight/Bias exhibits a more balanced distribution and thus the strength of features becomes more even. Extensive experiments on various long-tailed recognition benchmarks (i.e., CIFAR-10/100-LT, ImageNet-LT and iNaturalist 2018) show that our method outperforms previous state-of-the-art methods remarkably. The code and checkpoints are available at https://github.com/yuxiangbao/NBN.
Submitted 6 January, 2025; originally announced January 2025.
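
The re-parameterization described above is concrete enough to sketch directly: take the affine Weight (or Bias) of a BN layer as a vector, normalize it to unit length, and scale it by one learnable scalar, so that direction and magnitude are learned separately. Below is a minimal PyTorch reading of that recipe; the initialization choices are assumptions, and the released code at the linked repository is the authoritative reference.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class NormalizedBN2d(nn.Module):
    """BatchNorm2d whose affine Weight/Bias are re-parameterized as a
    unit-norm direction vector times a learnable scalar magnitude."""
    def __init__(self, num_features, eps=1e-5, momentum=0.1):
        super().__init__()
        self.bn = nn.BatchNorm2d(num_features, eps, momentum, affine=False)
        self.weight_dir = nn.Parameter(torch.ones(num_features))
        self.bias_dir = nn.Parameter(torch.zeros(num_features))
        # Magnitudes chosen so the initial effective Weight is 1 per channel
        # and the initial Bias is 0 (an illustrative assumption).
        self.weight_mag = nn.Parameter(torch.tensor(float(num_features) ** 0.5))
        self.bias_mag = nn.Parameter(torch.tensor(1.0))

    def forward(self, x):
        w = self.weight_mag * F.normalize(self.weight_dir, dim=0)
        b = self.bias_mag * F.normalize(self.bias_dir, dim=0)
        return self.bn(x) * w.view(1, -1, 1, 1) + b.view(1, -1, 1, 1)

if __name__ == "__main__":
    layer = NormalizedBN2d(16)
    print(layer(torch.randn(8, 16, 32, 32)).shape)  # torch.Size([8, 16, 32, 32])
```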

4. arXiv:2412.07226 [pdf, other] cs.CV (Computer Vision and Pattern Recognition)
Attention Head Purification: A New Perspective to Harness CLIP for Domain Generalization
Authors: Yingfan Wang, Guoliang Kang
Abstract: Domain Generalization (DG) aims to learn a model from multiple source domains that achieves satisfactory performance on unseen target domains. Recent works introduce CLIP to DG tasks due to its superior image-text alignment and zero-shot performance. Previous methods either utilize full fine-tuning or prompt-learning paradigms to harness CLIP for DG tasks. Those works focus on avoiding catastrophic forgetting of the original knowledge encoded in CLIP but ignore that the knowledge encoded in CLIP may by nature contain domain-specific cues that constrain its domain generalization performance. In this paper, we propose a new perspective to harness CLIP for DG, i.e., attention head purification. We observe that different attention heads may encode different properties of an image, and selecting heads appropriately may yield remarkable performance improvements across domains. Based on such observations, we purify the attention heads of CLIP at two levels: task-level purification and domain-level purification. For task-level purification, we design head-aware LoRA to make each head more adapted to the considered task. For domain-level purification, we perform head selection via a simple gating strategy. We utilize an MMD loss to encourage masked head features to be more domain-invariant, emphasizing more generalizable properties/heads. During training, we jointly perform task-level and domain-level purification. We conduct experiments on various representative DG benchmarks. Though simple, extensive experiments demonstrate that our method performs favorably against previous state-of-the-art methods.
Submitted 10 December, 2024; originally announced December 2024.
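
For the domain-level purification step, the entry above only states that heads are selected through a simple gating strategy and that an MMD loss pushes the gated (masked) head features toward domain invariance. The sketch below uses sigmoid gates on per-head outputs and a linear-kernel MMD purely as illustrative stand-ins; the actual gate form, relaxation, and kernel used in the paper are not specified here.

```python
import torch
import torch.nn as nn

class HeadGate(nn.Module):
    """Learnable per-head gates applied to multi-head attention outputs."""
    def __init__(self, num_heads):
        super().__init__()
        self.logits = nn.Parameter(torch.zeros(num_heads))  # gates start near 0.5

    def forward(self, head_outputs):
        # head_outputs: (batch, num_heads, tokens, head_dim)
        gates = torch.sigmoid(self.logits).view(1, -1, 1, 1)
        return head_outputs * gates

def mmd_loss(x, y):
    """A simple linear-kernel MMD between two feature batches (illustrative)."""
    return (x.mean(dim=0) - y.mean(dim=0)).pow(2).sum()

if __name__ == "__main__":
    gate = HeadGate(num_heads=12)
    feats_a = gate(torch.randn(4, 12, 197, 64)).flatten(1)  # domain A features
    feats_b = gate(torch.randn(4, 12, 197, 64)).flatten(1)  # domain B features
    print(mmd_loss(feats_a, feats_b).item())
```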

5. arXiv:2412.06234 [pdf, other] cs.CV (Computer Vision and Pattern Recognition); cs.GR (Graphics)
Generative Densification: Learning to Densify Gaussians for High-Fidelity Generalizable 3D Reconstruction
Authors: Seungtae Nam, Xiangyu Sun, Gyeongjin Kang, Younggeun Lee, Seungjun Oh, Eunbyung Park
Abstract: Generalized feed-forward Gaussian models have achieved significant progress in sparse-view 3D reconstruction by leveraging prior knowledge from large multi-view datasets. However, these models often struggle to represent high-frequency details due to the limited number of Gaussians. While the densification strategy used in per-scene 3D Gaussian splatting (3D-GS) optimization can be adapted to the feed-forward models, it may not be ideally suited for generalized scenarios. In this paper, we propose Generative Densification, an efficient and generalizable method to densify Gaussians generated by feed-forward models. Unlike the 3D-GS densification strategy, which iteratively splits and clones raw Gaussian parameters, our method up-samples feature representations from the feed-forward models and generates their corresponding fine Gaussians in a single forward pass, leveraging the embedded prior knowledge for enhanced generalization. Experimental results on both object-level and scene-level reconstruction tasks demonstrate that our method outperforms state-of-the-art approaches with comparable or smaller model sizes, achieving notable improvements in representing fine details.
Submitted 12 December, 2024; v1 submitted 9 December, 2024; originally announced December 2024.
Comments: Project page: https://stnamjef.github.io/GenerativeDensification/

6. arXiv:2411.17190 [pdf, other] cs.CV (Computer Vision and Pattern Recognition)
SelfSplat: Pose-Free and 3D Prior-Free Generalizable 3D Gaussian Splatting
Authors: Gyeongjin Kang, Jisang Yoo, Jihyeon Park, Seungtae Nam, Hyeonsoo Im, Sangheon Shin, Sangpil Kim, Eunbyung Park
Abstract: We propose SelfSplat, a novel 3D Gaussian Splatting model designed to perform pose-free and 3D prior-free generalizable 3D reconstruction from unposed multi-view images. These settings are inherently ill-posed due to the lack of ground-truth data, learned geometric information, and the need to achieve accurate 3D reconstruction without fine-tuning, making it difficult for conventional methods to achieve high-quality results. Our model addresses these challenges by effectively integrating explicit 3D representations with self-supervised depth and pose estimation techniques, resulting in reciprocal improvements in both pose accuracy and 3D reconstruction quality. Furthermore, we incorporate a matching-aware pose estimation network and a depth refinement module to enhance geometry consistency across views, ensuring more accurate and stable 3D reconstructions. To demonstrate the performance of our method, we evaluated it on large-scale real-world datasets, including RealEstate10K, ACID, and DL3DV. SelfSplat achieves superior results over previous state-of-the-art methods in both appearance and geometry quality, and also demonstrates strong cross-dataset generalization capabilities. Extensive ablation studies and analyses also validate the effectiveness of our proposed methods. Code and pretrained models are available at https://gynjn.github.io/selfsplat/
Submitted 27 November, 2024; v1 submitted 26 November, 2024; originally announced November 2024.
Comments: Project page: https://gynjn.github.io/selfsplat/

7. arXiv:2411.07446 [pdf] cs.CL (Computation and Language)
Efficient and Accurate Prompt Optimization: the Benefit of Memory in Exemplar-Guided Reflection
Authors: Cilin Yan, Jingyun Wang, Lin Zhang, Ruihui Zhao, Xiaopu Wu, Kai Xiong, Qingsong Liu, Guoliang Kang, Yangyang Kang
Abstract: Automatic prompt engineering aims to enhance the generation quality of large language models (LLMs). Recent works utilize feedback generated from erroneous cases to guide prompt optimization. During inference, they may further retrieve several semantically related exemplars and concatenate them to the optimized prompts to improve performance. However, those works only utilize the feedback at the current step, ignoring historical and unselected feedback which is potentially beneficial. Moreover, the selection of exemplars only considers the general semantic relationship and may not be optimal in terms of task performance and matching with the optimized prompt. In this work, we propose an Exemplar-Guided Reflection with Memory mechanism (ERM) to realize more efficient and accurate prompt optimization. Specifically, we design an exemplar-guided reflection mechanism where the feedback generation is additionally guided by the generated exemplars. We further build two kinds of memory to fully utilize the historical feedback information and support more effective exemplar retrieval. Empirical evaluations show our method surpasses previous state-of-the-art methods with fewer optimization steps, i.e., improving the F1 score by 10.1 on the LIAR dataset and halving the optimization steps compared to ProTeGi.
Submitted 11 November, 2024; originally announced November 2024.
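
The entry above mentions two memories, one for historical feedback and one supporting exemplar retrieval, without detailing their structure. The toy class below is only a generic illustration of such a memory: stored exemplars carry an embedding and a task score, and retrieval blends semantic similarity with that score. The fields, the scoring rule, and the `alpha` weighting are assumptions, not ERM's actual design.

```python
import numpy as np

class ExemplarMemory:
    """A toy memory of (exemplar_text, embedding, task_score) entries.
    Retrieval mixes semantic similarity with the stored task score, which is
    one possible reading of "more effective exemplar retrieval"."""
    def __init__(self):
        self.entries = []            # list of (text, embedding, score)

    def add(self, text, embedding, score):
        self.entries.append((text, np.asarray(embedding, dtype=float), score))

    def retrieve(self, query_emb, k=2, alpha=0.5):
        query_emb = np.asarray(query_emb, dtype=float)

        def key(entry):
            _, emb, score = entry
            denom = np.linalg.norm(emb) * np.linalg.norm(query_emb) + 1e-8
            sim = float(emb @ query_emb) / denom
            return alpha * sim + (1 - alpha) * score

        return [t for t, _, _ in sorted(self.entries, key=key, reverse=True)[:k]]

if __name__ == "__main__":
    mem = ExemplarMemory()
    mem.add("claim: ... label: false", [1.0, 0.0], score=0.9)
    mem.add("claim: ... label: true", [0.0, 1.0], score=0.4)
    print(mem.retrieve([0.9, 0.1], k=1))
```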

8. arXiv:2411.00508 [pdf, other] cs.RO (Robotics)
CLIP-RT: Learning Language-Conditioned Robotic Policies from Natural Language Supervision
Authors: Gi-Cheon Kang, Junghyun Kim, Kyuhwan Shim, Jun Ki Lee, Byoung-Tak Zhang
Abstract: This paper explores how non-experts can teach robots desired skills in their environments. We argue that natural language is an intuitive and accessible interface for robot learning. To this end, we investigate two key aspects: (1) how non-experts collect robotic data using natural language supervision and (2) how pre-trained vision-language models learn end-to-end policies directly from this supervision. We propose a data collection framework that collects robot demonstrations based on natural language supervision (e.g., "move forward") and further augments these demonstrations. Next, we introduce a model that learns language-conditioned policies from natural language supervision called CLIP-RT. Our model employs pre-trained CLIP models and learns to predict actions represented in language via contrastive imitation learning. We first train CLIP-RT on large-scale robotic data and then enable it to learn desired skills using data collected from our framework. CLIP-RT shows strong capabilities in acquiring novel manipulation skills, outperforming the state-of-the-art model, OpenVLA (7B parameters), by 17% in average success rates, while using 7x fewer parameters (1B).
Submitted 1 November, 2024; originally announced November 2024.
Comments: 27 pages, 27 figures

9. arXiv:2410.20772 [pdf, other] cs.LG (Machine Learning); cs.AI (Artificial Intelligence); stat.ML (Machine Learning)
Introducing Spectral Attention for Long-Range Dependency in Time Series Forecasting
Authors: Bong Gyun Kang, Dongjun Lee, HyunGi Kim, DoHyun Chung, Sungroh Yoon
Abstract: Sequence modeling faces challenges in capturing long-range dependencies across diverse tasks. Recent linear and transformer-based forecasters have shown superior performance in time series forecasting. However, they are constrained by their inherent inability to effectively address long-range dependencies in time series data, primarily due to using fixed-size inputs for prediction. Furthermore, they typically sacrifice essential temporal correlation among consecutive training samples by shuffling them into mini-batches. To overcome these limitations, we introduce a fast and effective Spectral Attention mechanism, which preserves temporal correlations among samples and facilitates the handling of long-range information while maintaining the base model structure. Spectral Attention preserves long-period trends through a low-pass filter and facilitates gradient flow between samples. Spectral Attention can be seamlessly integrated into most sequence models, allowing models with fixed-sized look-back windows to capture long-range dependencies over thousands of steps. Through extensive experiments on 11 real-world time series datasets using 7 recent forecasting models, we consistently demonstrate the efficacy of our Spectral Attention mechanism, achieving state-of-the-art results.
Submitted 21 November, 2024; v1 submitted 28 October, 2024; originally announced October 2024.
Comments: Co-first authors: Bong Gyun Kang, Dongjun Lee. NeurIPS 2024 (Conference on Neural Information Processing Systems)
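
Only the high-level idea is given above: a low-pass filter over the features of consecutive (unshuffled) training samples preserves long-period trends while the base forecaster is left unchanged. The module below uses an exponential moving average, a standard low-pass filter, purely to illustrate that idea; the EMA form, the `alpha` value, and the way the trend is mixed back in are assumptions rather than the paper's formulation (which is also designed to let gradients flow between samples, something the `detach` here deliberately sidesteps for simplicity).

```python
import torch
import torch.nn as nn

class LowPassMemory(nn.Module):
    """Keeps an exponential moving average of window features across
    consecutive samples and feeds it back to the model as extra context."""
    def __init__(self, dim, alpha=0.9):
        super().__init__()
        self.alpha = alpha
        self.register_buffer("state", torch.zeros(dim))
        self.mix = nn.Linear(2 * dim, dim)    # combine current window + trend

    def forward(self, window_feat):           # window_feat: (dim,) per sample
        # EMA acts as a low-pass filter: slow trends persist, noise is damped.
        self.state = self.alpha * self.state + (1 - self.alpha) * window_feat.detach()
        return self.mix(torch.cat([window_feat, self.state], dim=-1))

if __name__ == "__main__":
    mem = LowPassMemory(dim=32)
    for _ in range(5):                        # consecutive, unshuffled samples
        out = mem(torch.randn(32))
    print(out.shape)                          # torch.Size([32])
```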

10. arXiv:2410.17505 [pdf, other] cs.CV (Computer Vision and Pattern Recognition)
PLGS: Robust Panoptic Lifting with 3D Gaussian Splatting
Authors: Yu Wang, Xiaobao Wei, Ming Lu, Guoliang Kang
Abstract: Previous methods utilize the Neural Radiance Field (NeRF) for panoptic lifting, but their training and rendering speeds are unsatisfactory. In contrast, 3D Gaussian Splatting (3DGS) has emerged as a prominent technique due to its rapid training and rendering speed. However, unlike NeRF, the conventional 3DGS may not satisfy the basic smoothness assumption, as it does not rely on any parameterized structures (e.g., MLPs) to render. Consequently, the conventional 3DGS is by nature more susceptible to noisy 2D mask supervision. In this paper, we propose a new method called PLGS that enables 3DGS to generate consistent panoptic segmentation masks from noisy 2D segmentation masks while maintaining superior efficiency compared to NeRF-based methods. Specifically, we build a panoptic-aware structured 3D Gaussian model to introduce smoothness and design effective noise reduction strategies. For the semantic field, instead of initialization with structure from motion, we construct reliable semantic anchor points to initialize the 3D Gaussians. We then use these anchor points as smooth regularization during training. Additionally, we present a self-training approach using pseudo labels generated by merging the rendered masks with the noisy masks to enhance the robustness of PLGS. For the instance field, we project the 2D instance masks into 3D space and match them with oriented bounding boxes to generate cross-view consistent instance masks for supervision. Experiments on various benchmarks demonstrate that our method outperforms previous state-of-the-art methods in terms of both segmentation quality and speed.
Submitted 22 October, 2024; originally announced October 2024.

11. arXiv:2410.17267 [pdf, other] cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence)
Zero-Shot Vision-and-Language Navigation with Collision Mitigation in Continuous Environment
Authors: Seongjun Jeong, Gi-Cheon Kang, Joochan Kim, Byoung-Tak Zhang
Abstract: We propose zero-shot Vision-and-Language Navigation with Collision Mitigation (VLN-CM), which takes these considerations into account. VLN-CM is composed of four modules and predicts the direction and distance of the next movement at each step. We utilize large foundation models for each module. To select the direction, we use the Attention Spot Predictor (ASP), View Selector (VS), and Progress Monitor (PM). The ASP employs a Large Language Model (e.g., ChatGPT) to split navigation instructions into attention spots, which are objects or scenes at the location to move to (e.g., a yellow door). The VS selects, from panorama images provided at 30-degree intervals, the one that includes the attention spot, using CLIP similarity; we then choose the angle of the selected image as the direction to move in. The PM uses a rule-based approach to decide which attention spot to focus on next, among the multiple spots derived from the instructions. If the similarity between the current attention spot and the visual observations decreases consecutively at each step, the PM determines that the agent has passed the current spot and moves on to the next one. To select the distance to move, we employ the Open Map Predictor (OMP). The OMP uses panorama depth information to predict an occupancy mask, and we then select a collision-free distance in the predicted direction based on the occupancy mask. We evaluated our method using the validation data of VLN-CE. Our approach showed better performance than several baseline methods, and the OMP was effective in mitigating collisions for the agent.
Submitted 7 October, 2024; originally announced October 2024.
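
The Progress Monitor in the entry above is described as a purely rule-based check: if the CLIP similarity between the current attention spot and the visual observation drops for several consecutive steps, the agent is assumed to have passed the spot and attention shifts to the next one. The sketch below implements that rule over a similarity history; the patience threshold of two consecutive drops is an assumed value.

```python
class ProgressMonitor:
    """Rule-based monitor: advance to the next attention spot after the
    similarity to the current spot has decreased `patience` steps in a row."""
    def __init__(self, spots, patience=2):
        self.spots = spots            # e.g., ["a yellow door", "the stairs"]
        self.idx = 0
        self.patience = patience
        self.prev_sim = None
        self.drops = 0

    def current_spot(self):
        return self.spots[self.idx]

    def update(self, similarity):
        """Feed the CLIP similarity between the current spot and the view."""
        if self.prev_sim is not None and similarity < self.prev_sim:
            self.drops += 1
        else:
            self.drops = 0
        self.prev_sim = similarity
        if self.drops >= self.patience and self.idx < len(self.spots) - 1:
            self.idx += 1             # assume the current spot has been passed
            self.prev_sim, self.drops = None, 0
        return self.current_spot()

if __name__ == "__main__":
    pm = ProgressMonitor(["a yellow door", "the stairs"])
    for sim in [0.21, 0.28, 0.26, 0.22, 0.30]:
        print(pm.update(sim))         # switches to "the stairs" after 2 drops
```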

12. arXiv:2410.09606 [pdf, other] cs.RO (Robotics)
A Collaborative Team of UAV-Hexapod for an Autonomous Retrieval System in GNSS-Denied Maritime Environments
Authors: Seungwook Lee, Maulana Bisyir Azhari, Gyuree Kang, Ozan Günes, Donghun Han, David Hyunchul Shim
Abstract: We present an integrated UAV-hexapod robotic system designed for GNSS-denied maritime operations, capable of autonomous deployment and retrieval of a hexapod robot via a winch mechanism installed on a UAV. This system is intended to address the challenges of localization, control, and mobility in dynamic maritime environments. Our solution leverages sensor fusion techniques, combining optical flow, LiDAR, and depth data for precise localization. Experimental results demonstrate the effectiveness of this system in real-world scenarios, validating its performance during field tests in both controlled and operational conditions in the MBZIRC 2023 Maritime Challenge.
Submitted 12 October, 2024; originally announced October 2024.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.19215">arXiv:2409.19215</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.19215">pdf</a>, <a href="https://arxiv.org/format/2409.19215">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> 1st Place Solution to the 8th HANDS Workshop Challenge -- ARCTIC Track: 3DGS-based Bimanual Category-agnostic Interaction Reconstruction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=On%2C+J">Jeongwan On</a>, <a href="/search/cs?searchtype=author&amp;query=Gwak%2C+K">Kyeonghwan Gwak</a>, <a href="/search/cs?searchtype=author&amp;query=Kang%2C+G">Gunyoung Kang</a>, <a href="/search/cs?searchtype=author&amp;query=Hwang%2C+H">Hyein Hwang</a>, <a href="/search/cs?searchtype=author&amp;query=Hwang%2C+S">Soohyun Hwang</a>, <a href="/search/cs?searchtype=author&amp;query=Cha%2C+J">Junuk Cha</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+J">Jaewook Han</a>, <a href="/search/cs?searchtype=author&amp;query=Baek%2C+S">Seungryul Baek</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.19215v2-abstract-short" style="display: inline;"> This report describes our 1st place solution to the 8th HANDS workshop challenge (ARCTIC track) in conjunction with ECCV 2024. In this challenge, we address the task of bimanual category-agnostic hand-object interaction reconstruction, which aims to generate 3D reconstructions of both hands and the object from a monocular video, without relying on predefined templates. This task is particularly ch&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.19215v2-abstract-full').style.display = 'inline'; document.getElementById('2409.19215v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.19215v2-abstract-full" style="display: none;"> This report describes our 1st place solution to the 8th HANDS workshop challenge (ARCTIC track) in conjunction with ECCV 2024. In this challenge, we address the task of bimanual category-agnostic hand-object interaction reconstruction, which aims to generate 3D reconstructions of both hands and the object from a monocular video, without relying on predefined templates. This task is particularly challenging due to the significant occlusion and dynamic contact between the hands and the object during bimanual manipulation. We worked to resolve these issues by introducing a mask loss and a 3D contact loss, respectively. Moreover, we applied 3D Gaussian Splatting (3DGS) to this task. As a result, our method achieved a value of 38.69 in the main metric, CD$_h$, on the ARCTIC test set. 
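<p class="is-size-7">For readers unfamiliar with the "mask loss" mentioned in this abstract, the sketch below shows one common form such a loss can take: a binary cross-entropy between a soft rendered silhouette and the observed segmentation mask. It is purely illustrative, not the challenge entry's implementation, and all names are hypothetical.</p> <pre><code>
# Hedged sketch of a simple silhouette ("mask") loss.
import torch
import torch.nn.functional as F

def mask_loss(rendered_alpha, gt_mask):
    """rendered_alpha: (B, H, W) soft opacities in [0, 1] from the renderer.
    gt_mask: (B, H, W) binary hand/object segmentation of the same view."""
    pred = rendered_alpha.clamp(1e-6, 1 - 1e-6)
    return F.binary_cross_entropy(pred, gt_mask.float())

# Example with dummy tensors:
alpha = torch.rand(1, 128, 128)
mask = (torch.rand(1, 128, 128) > 0.5)
print(mask_loss(alpha, mask))
</code></pre>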
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.19215v2-abstract-full').style.display = 'none'; document.getElementById('2409.19215v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.16181">arXiv:2409.16181</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.16181">pdf</a>, <a href="https://arxiv.org/format/2409.16181">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> SPIBOT: A Drone-Tethered Mobile Gripper for Robust Aerial Object Retrieval in Dynamic Environments </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Kang%2C+G">Gyuree Kang</a>, <a href="/search/cs?searchtype=author&amp;query=G%C3%BCne%C5%9F%2C+O">Ozan G眉ne艧</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+S">Seungwook Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Azhari%2C+M+B">Maulana Bisyir Azhari</a>, <a href="/search/cs?searchtype=author&amp;query=Shim%2C+D+H">David Hyunchul Shim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.16181v1-abstract-short" style="display: inline;"> In real-world field operations, aerial grasping systems face significant challenges in dynamic environments due to strong winds, shifting surfaces, and the need to handle heavy loads. Particularly when dealing with heavy objects, the powerful propellers of the drone can inadvertently blow the target object away as it approaches, making the task even more difficult. To address these challenges, we&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.16181v1-abstract-full').style.display = 'inline'; document.getElementById('2409.16181v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.16181v1-abstract-full" style="display: none;"> In real-world field operations, aerial grasping systems face significant challenges in dynamic environments due to strong winds, shifting surfaces, and the need to handle heavy loads. Particularly when dealing with heavy objects, the powerful propellers of the drone can inadvertently blow the target object away as it approaches, making the task even more difficult. To address these challenges, we introduce SPIBOT, a novel drone-tethered mobile gripper system designed for robust and stable autonomous target retrieval. SPIBOT operates via a tether, much like a spider, allowing the drone to maintain a safe distance from the target. To ensure both stable mobility and secure grasping capabilities, SPIBOT is equipped with six legs and sensors to estimate the robot&#39;s and mission&#39;s states. 
It is designed with a reduced volume and weight compared to other hexapod robots, allowing it to be easily stowed under the drone and reeled in as needed. Designed for the 2024 MBZIRC Maritime Grand Challenge, SPIBOT is built to retrieve a 1kg target object in the highly dynamic conditions of the moving deck of a ship. This system integrates a real-time action selection algorithm that dynamically adjusts the robot&#39;s actions based on proximity to the mission goal and environmental conditions, enabling rapid and robust mission execution. Experimental results across various terrains, including a pontoon on a lake, a grass field, and rubber mats on coastal sand, demonstrate SPIBOT&#39;s ability to efficiently and reliably retrieve targets. SPIBOT swiftly converges on the target and completes its mission, even when dealing with irregular initial states and noisy information introduced by the drone. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.16181v1-abstract-full').style.display = 'none'; document.getElementById('2409.16181v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.12158">arXiv:2408.12158</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.12158">pdf</a>, <a href="https://arxiv.org/format/2408.12158">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computational Engineering, Finance, and Science">cs.CE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> </div> </div> <p class="title is-5 mathjax"> Could Bibliometrics Reveal Top Science and Technology Achievements and Researchers? The Case for Evaluatology-based Science and Technology Evaluation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Kang%2C+G">Guoxin Kang</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+W">Wanling Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+L">Lei Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+C">Chunjie Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+H">Hainan Ye</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+Q">Qian He</a>, <a href="/search/cs?searchtype=author&amp;query=Dai%2C+S">Shaopeng Dai</a>, <a href="/search/cs?searchtype=author&amp;query=Zhan%2C+J">Jianfeng Zhan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.12158v1-abstract-short" style="display: inline;"> By utilizing statistical methods to analyze bibliographic data, bibliometrics faces inherent limitations in identifying the most significant science and technology achievements and researchers. To overcome this challenge, we present an evaluatology-based science and technology evaluation methodology. 
At the heart of this approach lies the concept of an extended evaluation condition, encompassing e&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.12158v1-abstract-full').style.display = 'inline'; document.getElementById('2408.12158v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.12158v1-abstract-full" style="display: none;"> By utilizing statistical methods to analyze bibliographic data, bibliometrics faces inherent limitations in identifying the most significant science and technology achievements and researchers. To overcome this challenge, we present an evaluatology-based science and technology evaluation methodology. At the heart of this approach lies the concept of an extended evaluation condition, encompassing eight crucial components derived from a field. We define four relationships that illustrate the connections among various achievements based on their mapped extended EC components, as well as their temporal and citation links. Within a relationship under an extended evaluation condition, evaluators can effectively compare these achievements by carefully addressing the influence of confounding variables. We establish a real-world evaluation system encompassing an entire collection of achievements, each of which is mapped to several components of an extended EC. Within a specific field like chip technology or open source, we construct a perfect evaluation model that can accurately trace the evolution and development of all achievements in terms of four relationships based on the real-world evaluation system. Building upon the foundation of the perfect evaluation model, we put forth four-round rules to eliminate non-significant achievements by utilizing four relationships. This process allows us to establish a pragmatic evaluation model that effectively captures the essential achievements, serving as a curated collection of the top N achievements within a specific field during a specific timeframe. We present a case study on the top 100 Chip achievements which highlights its practical application and efficacy in identifying significant achievements and researchers that otherwise can not be identified by using bibliometrics. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.12158v1-abstract-full').style.display = 'none'; document.getElementById('2408.12158v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">18 pages, 8 figures, and 2 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.08295">arXiv:2408.08295</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.08295">pdf</a>, <a href="https://arxiv.org/format/2408.08295">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> SLCA++: Unleash the Power of Sequential Fine-tuning for Continual Learning with Pre-training </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+G">Gengwei Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+L">Liyuan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Kang%2C+G">Guoliang Kang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+L">Ling Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+Y">Yunchao Wei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.08295v1-abstract-short" style="display: inline;"> In recent years, continual learning with pre-training (CLPT) has received widespread interest, instead of its traditional focus of training from scratch. The use of strong pre-trained models (PTMs) can greatly facilitate knowledge transfer and alleviate catastrophic forgetting, but also suffers from progressive overfitting of pre-trained knowledge into specific downstream tasks. A majority of curr&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.08295v1-abstract-full').style.display = 'inline'; document.getElementById('2408.08295v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.08295v1-abstract-full" style="display: none;"> In recent years, continual learning with pre-training (CLPT) has received widespread interest, instead of its traditional focus of training from scratch. The use of strong pre-trained models (PTMs) can greatly facilitate knowledge transfer and alleviate catastrophic forgetting, but also suffers from progressive overfitting of pre-trained knowledge into specific downstream tasks. A majority of current efforts often keep the PTMs frozen and incorporate task-specific prompts to instruct representation learning, coupled with a prompt selection process for inference. However, due to the limited capacity of prompt parameters, this strategy demonstrates only sub-optimal performance in continual learning. In comparison, tuning all parameters of PTMs often provides the greatest potential for representation learning, making sequential fine-tuning (Seq FT) a fundamental baseline that has been overlooked in CLPT. To this end, we present an in-depth analysis of the progressive overfitting problem from the lens of Seq FT. 
Considering that the overly fast representation learning and the biased classification layer constitute this particular problem, we introduce the advanced Slow Learner with Classifier Alignment (SLCA++) framework to unleash the power of Seq FT, serving as a strong baseline approach for CLPT. Our approach involves a Slow Learner to selectively reduce the learning rate of backbone parameters, and a Classifier Alignment to align the disjoint classification layers in a post-hoc fashion. We further enhance the efficacy of SL with a symmetric cross-entropy loss, as well as employ a parameter-efficient strategy to implement Seq FT with SLCA++. Across a variety of continual learning scenarios on image classification benchmarks, our approach provides substantial improvements and outperforms state-of-the-art methods by a large margin. Code: https://github.com/GengDavid/SLCA. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.08295v1-abstract-full').style.display = 'none'; document.getElementById('2408.08295v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper is an extension of our ICCV 23 paper (arXiv:2303.05118)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.08087">arXiv:2408.08087</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.08087">pdf</a>, <a href="https://arxiv.org/format/2408.08087">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ColorMamba: Towards High-quality NIR-to-RGB Spectral Translation with Mamba </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhai%2C+H">Huiyu Zhai</a>, <a href="/search/cs?searchtype=author&amp;query=Jin%2C+G">Guang Jin</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+X">Xingxing Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Kang%2C+G">Guosheng Kang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.08087v1-abstract-short" style="display: inline;"> Translating NIR to the visible spectrum is challenging due to cross-domain complexities. Current models struggle to balance a broad receptive field with computational efficiency, limiting practical use. 
Although the Selective Structured State Space Model, especially the improved version, Mamba, excels in generative tasks by capturing long-range dependencies with linear complexity, its default appr&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.08087v1-abstract-full').style.display = 'inline'; document.getElementById('2408.08087v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.08087v1-abstract-full" style="display: none;"> Translating NIR to the visible spectrum is challenging due to cross-domain complexities. Current models struggle to balance a broad receptive field with computational efficiency, limiting practical use. Although the Selective Structured State Space Model, especially the improved version, Mamba, excels in generative tasks by capturing long-range dependencies with linear complexity, its default approach of converting 2D images into 1D sequences neglects local context. In this work, we propose a simple but effective backbone, dubbed ColorMamba, which first introduces Mamba into spectral translation tasks. To explore global long-range dependencies and local context for efficient spectral translation, we introduce learnable padding tokens to enhance the distinction of image boundaries and prevent potential confusion within the sequence model. Furthermore, local convolutional enhancement and agent attention are designed to improve the vanilla Mamba. Moreover, we exploit the HSV color to provide multi-scale guidance in the reconstruction process for more accurate spectral translation. Extensive experiments show that our ColorMamba achieves a 1.02 improvement in terms of PSNR compared with the state-of-the-art method. Our code is available at https://github.com/AlexYangxx/ColorMamba. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.08087v1-abstract-full').style.display = 'none'; document.getElementById('2408.08087v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. 
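<p class="is-size-7">The "learnable padding tokens" mentioned in this abstract can be read as learned markers inserted when a 2D feature map is flattened into the 1D sequence consumed by the state-space model, so that row boundaries stay visible to the sequence model. The sketch below illustrates one such reading (one token appended per row); it is a hypothetical interpretation, not the ColorMamba code, and the Mamba block itself is omitted.</p> <pre><code>
# Hedged sketch: append a learned boundary token at the end of each row before
# flattening a (B, C, H, W) feature map into a (B, H*(W+1), C) sequence.
import torch
import torch.nn as nn

class RowBoundaryFlatten(nn.Module):
    def __init__(self, channels):
        super().__init__()
        self.pad_token = nn.Parameter(torch.randn(1, 1, channels))

    def forward(self, feat):                    # feat: (B, C, H, W)
        b, c, h, w = feat.shape
        seq = feat.permute(0, 2, 3, 1)          # (B, H, W, C)
        pad = self.pad_token.expand(b, h, -1).unsqueeze(2)  # (B, H, 1, C)
        seq = torch.cat([seq, pad], dim=2)      # one boundary token per row
        return seq.reshape(b, h * (w + 1), c)   # 1D sequence for the sequence model

print(RowBoundaryFlatten(64)(torch.randn(2, 64, 16, 16)).shape)  # (2, 272, 64)
</code></pre>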
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Code is available at https://github.com/AlexYangxx/ColorMamba</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.06747">arXiv:2408.06747</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.06747">pdf</a>, <a href="https://arxiv.org/format/2408.06747">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ReCLIP++: Learn to Rectify the Bias of CLIP for Unsupervised Semantic Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Jingyun Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Kang%2C+G">Guoliang Kang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.06747v2-abstract-short" style="display: inline;"> Recent works utilize CLIP to perform the challenging unsupervised semantic segmentation task where only images without annotations are available. However, we observe that when adopting CLIP to such a pixel-level understanding task, unexpected bias (including class-preference bias and space-preference bias) occurs. Previous works don&#39;t explicitly model the bias, which largely constrains the segment&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.06747v2-abstract-full').style.display = 'inline'; document.getElementById('2408.06747v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.06747v2-abstract-full" style="display: none;"> Recent works utilize CLIP to perform the challenging unsupervised semantic segmentation task where only images without annotations are available. However, we observe that when adopting CLIP to such a pixel-level understanding task, unexpected bias (including class-preference bias and space-preference bias) occurs. Previous works don&#39;t explicitly model the bias, which largely constrains the segmentation performance. In this paper, we propose to explicitly model and rectify the bias existing in CLIP to facilitate the unsupervised semantic segmentation task. Specifically, we design a learnable &#34;Reference&#34; prompt to encode class-preference bias and a projection of the positional embedding in the vision transformer to encode space-preference bias respectively. To avoid interference, two kinds of biases are firstly independently encoded into different features, i.e., the Reference feature and the positional feature. Via a matrix multiplication between the Reference feature and the positional feature, a bias logit map is generated to explicitly represent two kinds of biases. Then we rectify the logits of CLIP via a simple element-wise subtraction. To make the rectified results smoother and more contextual, we design a mask decoder which takes the feature of CLIP and the rectified logits as input and outputs a rectified segmentation mask with the help of Gumbel-Softmax operation. 
A contrastive loss based on the masked visual features and the text features of different classes is imposed, which makes the bias modeling and rectification process meaningful and effective. Extensive experiments on various benchmarks including PASCAL VOC, PASCAL Context, ADE20K, Cityscapes, and COCO Stuff demonstrate that our method performs favorably against previous state-of-the-arts. The implementation is available at: https://github.com/dogehhh/ReCLIP. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.06747v2-abstract-full').style.display = 'none'; document.getElementById('2408.06747v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Extended version of our CVPR 24 paper</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.16770">arXiv:2407.16770</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.16770">pdf</a>, <a href="https://arxiv.org/format/2407.16770">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Infinite Ends from Finite Samples: Open-Ended Goal Inference as Top-Down Bayesian Filtering of Bottom-Up Proposals </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhi-Xuan%2C+T">Tan Zhi-Xuan</a>, <a href="/search/cs?searchtype=author&amp;query=Kang%2C+G">Gloria Kang</a>, <a href="/search/cs?searchtype=author&amp;query=Mansinghka%2C+V">Vikash Mansinghka</a>, <a href="/search/cs?searchtype=author&amp;query=Tenenbaum%2C+J+B">Joshua B. Tenenbaum</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.16770v1-abstract-short" style="display: inline;"> The space of human goals is tremendously vast; and yet, from just a few moments of watching a scene or reading a story, we seem to spontaneously infer a range of plausible motivations for the people and characters involved. What explains this remarkable capacity for intuiting other agents&#39; goals, despite the infinitude of ends they might pursue? And how does this cohere with our understanding of o&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.16770v1-abstract-full').style.display = 'inline'; document.getElementById('2407.16770v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.16770v1-abstract-full" style="display: none;"> The space of human goals is tremendously vast; and yet, from just a few moments of watching a scene or reading a story, we seem to spontaneously infer a range of plausible motivations for the people and characters involved. 
What explains this remarkable capacity for intuiting other agents&#39; goals, despite the infinitude of ends they might pursue? And how does this cohere with our understanding of other people as approximately rational agents? In this paper, we introduce a sequential Monte Carlo model of open-ended goal inference, which combines top-down Bayesian inverse planning with bottom-up sampling based on the statistics of co-occurring subgoals. By proposing goal hypotheses related to the subgoals achieved by an agent, our model rapidly generates plausible goals without exhaustive search, then filters out goals that would be irrational given the actions taken so far. We validate this model in a goal inference task called Block Words, where participants try to guess the word that someone is stacking out of lettered blocks. In comparison to both heuristic bottom-up guessing and exact Bayesian inference over hundreds of goals, our model better predicts the mean, variance, efficiency, and resource rationality of human goal inferences, achieving similar accuracy to the exact model at a fraction of the cognitive cost, while also explaining garden-path effects that arise from misleading bottom-up cues. Our experiments thus highlight the importance of uniting top-down and bottom-up models for explaining the speed, accuracy, and generality of human theory-of-mind. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.16770v1-abstract-full').style.display = 'none'; document.getElementById('2407.16770v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted for publication at CogSci 2024. 6 pages, 4 figures. 
(Appendix: 5 pages, 6 figures, 2 tables)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.11325">arXiv:2407.11325</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.11325">pdf</a>, <a href="https://arxiv.org/format/2407.11325">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> VISA: Reasoning Video Object Segmentation via Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yan%2C+C">Cilin Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+H">Haochen Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+S">Shilin Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+X">Xiaolong Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+Y">Yao Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Kang%2C+G">Guoliang Kang</a>, <a href="/search/cs?searchtype=author&amp;query=Xie%2C+W">Weidi Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Gavves%2C+E">Efstratios Gavves</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.11325v1-abstract-short" style="display: inline;"> Existing Video Object Segmentation (VOS) relies on explicit user instructions, such as categories, masks, or short phrases, restricting their ability to perform complex video segmentation requiring reasoning with world knowledge. In this paper, we introduce a new task, Reasoning Video Object Segmentation (ReasonVOS). This task aims to generate a sequence of segmentation masks in response to implic&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.11325v1-abstract-full').style.display = 'inline'; document.getElementById('2407.11325v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.11325v1-abstract-full" style="display: none;"> Existing Video Object Segmentation (VOS) relies on explicit user instructions, such as categories, masks, or short phrases, restricting their ability to perform complex video segmentation requiring reasoning with world knowledge. In this paper, we introduce a new task, Reasoning Video Object Segmentation (ReasonVOS). This task aims to generate a sequence of segmentation masks in response to implicit text queries that require complex reasoning abilities based on world knowledge and video contexts, which is crucial for structured environment understanding and object-centric interactions, pivotal in the development of embodied AI. To tackle ReasonVOS, we introduce VISA (Video-based large language Instructed Segmentation Assistant), to leverage the world knowledge reasoning capabilities of multi-modal LLMs while possessing the ability to segment and track objects in videos with a mask decoder. Moreover, we establish a comprehensive benchmark consisting of 35,074 instruction-mask sequence pairs from 1,042 diverse videos, which incorporates complex world knowledge reasoning into segmentation tasks for instruction-tuning and evaluation purposes of ReasonVOS models. 
Experiments conducted on 8 datasets demonstrate the effectiveness of VISA in tackling complex reasoning segmentation and vanilla referring segmentation in both video and image domains. The code and dataset are available at https://github.com/cilinyan/VISA. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.11325v1-abstract-full').style.display = 'none'; document.getElementById('2407.11325v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.11252">arXiv:2406.11252</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.11252">pdf</a>, <a href="https://arxiv.org/format/2406.11252">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Mining Open Semantics from CLIP: A Relation Transition Perspective for Few-Shot Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yan%2C+C">Cilin Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+H">Haochen Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+X">Xiaolong Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+Y">Yao Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+X">Xu Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Kang%2C+G">Guoliang Kang</a>, <a href="/search/cs?searchtype=author&amp;query=Gavves%2C+E">Efstratios Gavves</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.11252v2-abstract-short" style="display: inline;"> Contrastive Vision-Language Pre-training(CLIP) demonstrates impressive zero-shot capability. The key to improve the adaptation of CLIP to downstream task with few exemplars lies in how to effectively model and transfer the useful knowledge embedded in CLIP. Previous work mines the knowledge typically based on the limited visual samples and close-set semantics (i.e., within target category set of d&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.11252v2-abstract-full').style.display = 'inline'; document.getElementById('2406.11252v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.11252v2-abstract-full" style="display: none;"> Contrastive Vision-Language Pre-training(CLIP) demonstrates impressive zero-shot capability. The key to improve the adaptation of CLIP to downstream task with few exemplars lies in how to effectively model and transfer the useful knowledge embedded in CLIP. Previous work mines the knowledge typically based on the limited visual samples and close-set semantics (i.e., within target category set of downstream task). However, the aligned CLIP image/text encoders contain abundant relationships between visual features and almost infinite open semantics, which may benefit the few-shot learning but remains unexplored. 
In this paper, we propose to mine open semantics as anchors to perform a relation transition from image-anchor relationship to image-target relationship to make predictions. Specifically, we adopt a transformer module which takes the visual feature as &#34;Query&#34;, the text features of the anchors as &#34;Key&#34; and the similarity matrix between the text features of anchor and target classes as &#34;Value&#34;. In this way, the output of such a transformer module represents the relationship between the image and target categories, i.e., the classification predictions. To avoid manually selecting the open semantics, we make the [CLASS] token of input text embedding learnable. We conduct extensive experiments on eleven representative classification datasets. The results show that our method performs favorably against previous state-of-the-arts considering few-shot classification settings. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.11252v2-abstract-full').style.display = 'none'; document.getElementById('2406.11252v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.20233">arXiv:2405.20233</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2405.20233">pdf</a>, <a href="https://arxiv.org/format/2405.20233">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Grokfast: Accelerated Grokking by Amplifying Slow Gradients </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lee%2C+J">Jaerin Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Kang%2C+B+G">Bong Gyun Kang</a>, <a href="/search/cs?searchtype=author&amp;query=Kim%2C+K">Kihoon Kim</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+K+M">Kyoung Mu Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.20233v2-abstract-short" style="display: inline;"> One puzzling artifact in machine learning dubbed grokking is where delayed generalization is achieved tenfolds of iterations after near perfect overfitting to the training data. Focusing on the long delay itself on behalf of machine learning practitioners, our goal is to accelerate generalization of a model under grokking phenomenon. 
By regarding a series of gradients of a parameter over training&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.20233v2-abstract-full').style.display = 'inline'; document.getElementById('2405.20233v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.20233v2-abstract-full" style="display: none;"> One puzzling artifact in machine learning dubbed grokking is where delayed generalization is achieved tenfolds of iterations after near perfect overfitting to the training data. Focusing on the long delay itself on behalf of machine learning practitioners, our goal is to accelerate generalization of a model under grokking phenomenon. By regarding a series of gradients of a parameter over training iterations as a random signal over time, we can spectrally decompose the parameter trajectories under gradient descent into two components: the fast-varying, overfitting-yielding component and the slow-varying, generalization-inducing component. This analysis allows us to accelerate the grokking phenomenon more than $\times 50$ with only a few lines of code that amplifies the slow-varying components of gradients. The experiments show that our algorithm applies to diverse tasks involving images, languages, and graphs, enabling practical availability of this peculiar artifact of sudden generalization. Our code is available at https://github.com/ironjr/grokfast. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.20233v2-abstract-full').style.display = 'none'; document.getElementById('2405.20233v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 30 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">17 pages, 13 figures. Typo fixed. 
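<p class="is-size-7">The "few lines of code that amplifies the slow-varying components of gradients" described in the Grokfast abstract above can be approximated with an exponential moving average over per-parameter gradients. The sketch below is a hedged reconstruction of that idea with illustrative names and hyperparameters; the authors' reference code is in the linked repository.</p> <pre><code>
# Hedged sketch: keep an EMA of each parameter's gradient (the slow component)
# and add an amplified copy of it back to the raw gradient before the step.
import torch

def amplify_slow_gradients(model, ema=None, alpha=0.98, lamb=2.0):
    if ema is None:
        ema = {n: p.grad.detach().clone()
               for n, p in model.named_parameters() if p.grad is not None}
    for n, p in model.named_parameters():
        if p.grad is None:
            continue
        ema[n] = alpha * ema[n] + (1 - alpha) * p.grad.detach()
        p.grad = p.grad + lamb * ema[n]
    return ema

# In a training loop, initialize ema = None, then after loss.backward() and
# before optimizer.step():
#     ema = amplify_slow_gradients(model, ema)
</code></pre>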
Project page: https://jaerinlee.com/research/grokfast</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.16685">arXiv:2404.16685</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2404.16685">pdf</a>, <a href="https://arxiv.org/format/2404.16685">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Multi-scale HSV Color Feature Embedding for High-fidelity NIR-to-RGB Spectrum Translation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhai%2C+H">Huiyu Zhai</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+M">Mo Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+X">Xingxing Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Kang%2C+G">Gusheng Kang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.16685v1-abstract-short" style="display: inline;"> The NIR-to-RGB spectral domain translation is a formidable task due to the inherent spectral mapping ambiguities within NIR inputs and RGB outputs. Thus, existing methods fail to reconcile the tension between maintaining texture detail fidelity and achieving diverse color variations. In this paper, we propose a Multi-scale HSV Color Feature Embedding Network (MCFNet) that decomposes the mapping pr&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.16685v1-abstract-full').style.display = 'inline'; document.getElementById('2404.16685v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.16685v1-abstract-full" style="display: none;"> The NIR-to-RGB spectral domain translation is a formidable task due to the inherent spectral mapping ambiguities within NIR inputs and RGB outputs. Thus, existing methods fail to reconcile the tension between maintaining texture detail fidelity and achieving diverse color variations. In this paper, we propose a Multi-scale HSV Color Feature Embedding Network (MCFNet) that decomposes the mapping process into three sub-tasks, including NIR texture maintenance, coarse geometry reconstruction, and RGB color prediction. Thus, we propose three key modules for each corresponding sub-task: the Texture Preserving Block (TPB), the HSV Color Feature Embedding Module (HSV-CFEM), and the Geometry Reconstruction Module (GRM). These modules contribute to our MCFNet methodically tackling spectral translation through a series of escalating resolutions, progressively enriching images with color and texture fidelity in a scale-coherent fashion. The proposed MCFNet demonstrates substantial performance gains over the NIR image colorization task. Code is released at: https://github.com/AlexYangxx/MCFNet. 
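<p class="is-size-7">One plausible reading of the multi-scale HSV guidance described in this abstract is an HSV-space reconstruction loss applied at several resolutions alongside the usual RGB objective. The sketch below illustrates that reading only; it is not the MCFNet implementation, and the paper's modules (TPB, HSV-CFEM, GRM) are not reproduced.</p> <pre><code>
# Hedged sketch: L1 loss between predictions and targets in HSV space at
# multiple scales (hue wrap-around is ignored here for simplicity).
import torch
import torch.nn.functional as F
import kornia

def multiscale_hsv_loss(pred_rgb, target_rgb, scales=(1.0, 0.5, 0.25)):
    """pred_rgb, target_rgb: (B, 3, H, W) tensors with values in [0, 1]."""
    loss = 0.0
    for s in scales:
        p, t = pred_rgb, target_rgb
        if s != 1.0:
            p = F.interpolate(p, scale_factor=s, mode="bilinear", align_corners=False)
            t = F.interpolate(t, scale_factor=s, mode="bilinear", align_corners=False)
        loss = loss + F.l1_loss(kornia.color.rgb_to_hsv(p), kornia.color.rgb_to_hsv(t))
    return loss / len(scales)
</code></pre>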
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.16685v1-abstract-full').style.display = 'none'; document.getElementById('2404.16685v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.15190">arXiv:2404.15190</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2404.15190">pdf</a>, <a href="https://arxiv.org/format/2404.15190">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Socratic Planner: Inquiry-Based Zero-Shot Planning for Embodied Instruction Following </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Shin%2C+S">Suyeon Shin</a>, <a href="/search/cs?searchtype=author&amp;query=jeon%2C+S">Sujin jeon</a>, <a href="/search/cs?searchtype=author&amp;query=Kim%2C+J">Junghyun Kim</a>, <a href="/search/cs?searchtype=author&amp;query=Kang%2C+G">Gi-Cheon Kang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+B">Byoung-Tak Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.15190v1-abstract-short" style="display: inline;"> Embodied Instruction Following (EIF) is the task of executing natural language instructions by navigating and interacting with objects in 3D environments. One of the primary challenges in EIF is compositional task planning, which is often addressed with supervised or in-context learning with labeled data. To this end, we introduce the Socratic Planner, the first zero-shot planning method that infe&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.15190v1-abstract-full').style.display = 'inline'; document.getElementById('2404.15190v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.15190v1-abstract-full" style="display: none;"> Embodied Instruction Following (EIF) is the task of executing natural language instructions by navigating and interacting with objects in 3D environments. One of the primary challenges in EIF is compositional task planning, which is often addressed with supervised or in-context learning with labeled data. To this end, we introduce the Socratic Planner, the first zero-shot planning method that infers without the need for any training data. Socratic Planner first decomposes the instructions into substructural information of the task through self-questioning and answering, translating it into a high-level plan, i.e., a sequence of subgoals. 
Subgoals are executed sequentially, with our visually grounded re-planning mechanism adjusting plans dynamically through dense visual feedback. We also introduce an evaluation metric of high-level plans, RelaxedHLP, for a more comprehensive evaluation. Experiments demonstrate the effectiveness of the Socratic Planner, achieving competitive performance on both zero-shot and few-shot task planning in the ALFRED benchmark, particularly excelling in tasks requiring higher-dimensional inference. Additionally, precise adjustments to the plan were achieved by incorporating environmental visual information. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.15190v1-abstract-full').style.display = 'none'; document.getElementById('2404.15190v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">14 pages, 6 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68T01 (Primary) 68T40; 68T50; 68T45 (Secondary) </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.04913">arXiv:2404.04913</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2404.04913">pdf</a>, <a href="https://arxiv.org/format/2404.04913">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> CodecNeRF: Toward Fast Encoding and Decoding, Compact, and High-quality Novel-view Synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Kang%2C+G">Gyeongjin Kang</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+Y">Younggeun Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Oh%2C+S">Seungjun Oh</a>, <a href="/search/cs?searchtype=author&amp;query=Park%2C+E">Eunbyung Park</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.04913v3-abstract-short" style="display: inline;"> Neural Radiance Fields (NeRF) have achieved huge success in effectively capturing and representing 3D objects and scenes. However, to establish a ubiquitous presence in everyday media formats, such as images and videos, we need to fulfill three key objectives: 1. fast encoding and decoding time, 2. compact model sizes, and 3. high-quality renderings. Despite recent advancements, a comprehensive al&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.04913v3-abstract-full').style.display = 'inline'; document.getElementById('2404.04913v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.04913v3-abstract-full" style="display: none;"> Neural Radiance Fields (NeRF) have achieved huge success in effectively capturing and representing 3D objects and scenes.
However, to establish a ubiquitous presence in everyday media formats, such as images and videos, we need to fulfill three key objectives: 1. fast encoding and decoding time, 2. compact model sizes, and 3. high-quality renderings. Despite recent advancements, a comprehensive algorithm that adequately addresses all objectives has yet to be fully realized. In this work, we present CodecNeRF, a neural codec for NeRF representations, consisting of an encoder and decoder architecture that can generate a NeRF representation in a single forward pass. Furthermore, inspired by the recent parameter-efficient finetuning approaches, we propose a finetuning method to efficiently adapt the generated NeRF representations to a new test instance, leading to high-quality image renderings and compact code sizes. The proposed CodecNeRF, a newly suggested encoding-decoding-finetuning pipeline for NeRF, achieved unprecedented compression performance of more than 100x and remarkable reduction in encoding time while maintaining (or improving) the image quality on widely used 3D object datasets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.04913v3-abstract-full').style.display = 'none'; document.getElementById('2404.04913v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page: https://gynjn.github.io/CodecNeRF/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.00021">arXiv:2404.00021</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2404.00021">pdf</a>, <a href="https://arxiv.org/format/2404.00021">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computational Engineering, Finance, and Science">cs.CE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Performance">cs.PF</span> </div> </div> <p class="title is-5 mathjax"> Evaluatology: The Science and Engineering of Evaluation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhan%2C+J">Jianfeng Zhan</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+L">Lei Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+W">Wanling Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+H">Hongxiao Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+C">Chenxi Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+Y">Yunyou Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yatao Li</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Z">Zhengxin Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Kang%2C+G">Guoxin Kang</a>, <a 
href="/search/cs?searchtype=author&amp;query=Luo%2C+C">Chunjie Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+H">Hainan Ye</a>, <a href="/search/cs?searchtype=author&amp;query=Dai%2C+S">Shaopeng Dai</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zhifei Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.00021v1-abstract-short" style="display: inline;"> Evaluation is a crucial aspect of human existence and plays a vital role in various fields. However, it is often approached in an empirical and ad-hoc manner, lacking consensus on universal concepts, terminologies, theories, and methodologies. This lack of agreement has significant repercussions. This article aims to formally introduce the discipline of evaluatology, which encompasses the science&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.00021v1-abstract-full').style.display = 'inline'; document.getElementById('2404.00021v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.00021v1-abstract-full" style="display: none;"> Evaluation is a crucial aspect of human existence and plays a vital role in various fields. However, it is often approached in an empirical and ad-hoc manner, lacking consensus on universal concepts, terminologies, theories, and methodologies. This lack of agreement has significant repercussions. This article aims to formally introduce the discipline of evaluatology, which encompasses the science and engineering of evaluation. We propose a universal framework for evaluation, encompassing concepts, terminologies, theories, and methodologies that can be applied across various disciplines. Our research reveals that the essence of evaluation lies in conducting experiments that intentionally apply a well-defined evaluation condition to diverse subjects and infer the impact of different subjects by measuring and/or testing. Derived from the essence of evaluation, we propose five axioms focusing on key aspects of evaluation outcomes as the foundational evaluation theory. These axioms serve as the bedrock upon which we build universal evaluation theories and methodologies. When evaluating a single subject, it is crucial to create evaluation conditions with different levels of equivalency. By applying these conditions to diverse subjects, we can establish reference evaluation models. These models allow us to alter a single independent variable at a time while keeping all other variables as controls. When evaluating complex scenarios, the key lies in establishing a series of evaluation models that maintain transitivity. Building upon the science of evaluation, we propose a formal definition of a benchmark as a simplified and sampled evaluation condition that guarantees different levels of equivalency. This concept serves as the cornerstone for a universal benchmark-based engineering approach to evaluation across various disciplines, which we refer to as benchmarkology. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.00021v1-abstract-full').style.display = 'none'; document.getElementById('2404.00021v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">29 pages, 16 figures, and 2 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.15049">arXiv:2403.15049</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.15049">pdf</a>, <a href="https://arxiv.org/format/2403.15049">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Continual Vision-and-Language Navigation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Jeong%2C+S">Seongjun Jeong</a>, <a href="/search/cs?searchtype=author&amp;query=Kang%2C+G">Gi-Cheon Kang</a>, <a href="/search/cs?searchtype=author&amp;query=Choi%2C+S">Seongho Choi</a>, <a href="/search/cs?searchtype=author&amp;query=Kim%2C+J">Joochan Kim</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+B">Byoung-Tak Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.15049v2-abstract-short" style="display: inline;"> In developing Vision-and-Language Navigation (VLN) agents that navigate to a destination using natural language instructions and visual cues, current studies largely assume a \textit{train-once-deploy-once strategy}. We argue that this kind of strategy is less realistic, as deployed VLN agents are expected to encounter novel environments continuously through their lifetime. To facilitate more real&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.15049v2-abstract-full').style.display = 'inline'; document.getElementById('2403.15049v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.15049v2-abstract-full" style="display: none;"> In developing Vision-and-Language Navigation (VLN) agents that navigate to a destination using natural language instructions and visual cues, current studies largely assume a \textit{train-once-deploy-once strategy}. We argue that this kind of strategy is less realistic, as deployed VLN agents are expected to encounter novel environments continuously through their lifetime. To facilitate more realistic setting for VLN agents, we propose Continual Vision-and-Language Navigation (CVLN) paradigm for agents to continually learn and adapt to changing environments. In CVLN, the agents are trained and evaluated incrementally across multiple \textit{scene domains} (i.e., environments). 
We present two CVLN learning setups to consider diverse forms of natural language instructions: initial-instruction-based CVLN, focused on navigation via initial-instruction interpretation, and dialogue-based CVLN, designed for navigation through dialogue with other agents. We introduce two simple yet effective baseline methods, tailored to the sequential decision-making needs of CVLN: Perplexity Replay (PerpR) and Episodic Self-Replay (ESR), both employing a rehearsal mechanism. PerpR selects replay episodes based on episode difficulty, while ESR stores and revisits action logits from individual episode steps during training to refine learning. Experimental results indicate that while existing continual learning methods are insufficient for CVLN, PerpR and ESR outperform the comparison methods by effectively utilizing replay memory.
Submitted 21 December, 2024; v1 submitted 22 March, 2024; originally announced March 2024.
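The entry above describes a difficulty-prioritized rehearsal mechanism. Below is a minimal, hedged sketch (not the authors' implementation) of that general idea: episodes are stored together with a scalar difficulty score, the easiest episodes are evicted first, and replay batches are sampled with probability proportional to difficulty. The class and field names are illustrative assumptions.

```python
# Minimal sketch of a difficulty-prioritized rehearsal buffer (illustrative only).
import random
from dataclasses import dataclass, field

@dataclass
class Episode:
    observations: list      # per-step observations (placeholder type)
    actions: list           # per-step actions
    difficulty: float       # e.g., mean per-step perplexity under the current policy

@dataclass
class DifficultyReplayBuffer:
    capacity: int
    episodes: list = field(default_factory=list)

    def add(self, episode: Episode) -> None:
        self.episodes.append(episode)
        if len(self.episodes) > self.capacity:
            # Evict the easiest episode so hard cases are retained longer.
            self.episodes.remove(min(self.episodes, key=lambda e: e.difficulty))

    def sample(self, k: int) -> list:
        if not self.episodes:
            return []
        weights = [e.difficulty for e in self.episodes]
        return random.choices(self.episodes, weights=weights, k=min(k, len(self.episodes)))

# Usage: after training on one scene domain, add its episodes; when training on the
# next domain, mix sampled replay episodes into each batch to reduce forgetting.
```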
arXiv:2401.16808 [pdf, other] (cs.LG, cs.AI)
Encoding Temporal Statistical-space Priors via Augmented Representation
Authors: Insu Choi, Woosung Koh, Gimin Kang, Yuntae Jang, Woo Chang Kim
Abstract: Modeling time series data remains a pervasive issue as the temporal dimension is inherent to numerous domains. Despite significant strides in time series forecasting, high noise-to-signal ratio, non-normality, non-stationarity, and lack of data continue challenging practitioners. In response, we leverage a simple representation augmentation technique to overcome these challenges. Our augmented representation acts as a statistical-space prior encoded at each time step. Accordingly, we name our method Statistical-space Augmented Representation (SSAR). The underlying high-dimensional data-generating process inspires our representation augmentation. We rigorously examine the empirical generalization performance on two data sets with two downstream temporal learning algorithms. Our approach significantly beats all five up-to-date baselines. Moreover, the highly modular nature of our approach can easily be applied to various settings. Lastly, fully-fledged theoretical perspectives are available throughout the writing for a clear and rigorous understanding.
Submitted 12 August, 2024; v1 submitted 30 January, 2024; originally announced January 2024.
Comments: IJCAI 2024 STRL Workshop (Oral)
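The abstract above speaks of a statistical-space prior attached to each time step. One plausible reading, sketched below purely as an illustration and not as the paper's method, is to concatenate rolling statistics of the raw series to each time step's value; the window size and the chosen statistics are assumptions.

```python
# Minimal sketch: augment each step of a univariate series with rolling statistics.
import numpy as np

def statistical_augmentation(x: np.ndarray, window: int = 32) -> np.ndarray:
    """x: shape (T,) raw series -> shape (T, 4): [value, rolling mean, rolling std, z-score]."""
    T = len(x)
    out = np.zeros((T, 4))
    for t in range(T):
        w = x[max(0, t - window + 1): t + 1]
        mu, sigma = w.mean(), w.std() + 1e-8
        out[t] = [x[t], mu, sigma, (x[t] - mu) / sigma]
    return out

if __name__ == "__main__":
    series = np.cumsum(np.random.randn(256))      # toy random-walk series
    augmented = statistical_augmentation(series)  # feed to any downstream sequence model
    print(augmented.shape)                        # (256, 4)
```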
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">IJCAI 2024 STRL Workshop (Oral)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.14611">arXiv:2312.14611</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2312.14611">pdf</a>, <a href="https://arxiv.org/format/2312.14611">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Tuning-Free Inversion-Enhanced Control for Consistent Image Editing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Duan%2C+X">Xiaoyue Duan</a>, <a href="/search/cs?searchtype=author&amp;query=Cui%2C+S">Shuhao Cui</a>, <a href="/search/cs?searchtype=author&amp;query=Kang%2C+G">Guoliang Kang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+B">Baochang Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Fei%2C+Z">Zhengcong Fei</a>, <a href="/search/cs?searchtype=author&amp;query=Fan%2C+M">Mingyuan Fan</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+J">Junshi Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.14611v1-abstract-short" style="display: inline;"> Consistent editing of real images is a challenging task, as it requires performing non-rigid edits (e.g., changing postures) to the main objects in the input image without changing their identity or attributes. To guarantee consistent attributes, some existing methods fine-tune the entire model or the textual embedding for structural consistency, but they are time-consuming and fail to perform non&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.14611v1-abstract-full').style.display = 'inline'; document.getElementById('2312.14611v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.14611v1-abstract-full" style="display: none;"> Consistent editing of real images is a challenging task, as it requires performing non-rigid edits (e.g., changing postures) to the main objects in the input image without changing their identity or attributes. To guarantee consistent attributes, some existing methods fine-tune the entire model or the textual embedding for structural consistency, but they are time-consuming and fail to perform non-rigid edits. Other works are tuning-free, but their performances are weakened by the quality of Denoising Diffusion Implicit Model (DDIM) reconstruction, which often fails in real-world scenarios. In this paper, we present a novel approach called Tuning-free Inversion-enhanced Control (TIC), which directly correlates features from the inversion process with those from the sampling process to mitigate the inconsistency in DDIM reconstruction. Specifically, our method effectively obtains inversion features from the key and value features in the self-attention layers, and enhances the sampling process by these inversion features, thus achieving accurate reconstruction and content-consistent editing. 
To extend the applicability of our method to general editing scenarios, we also propose a mask-guided attention concatenation strategy that combines contents from both the inversion and the naive DDIM editing processes. Experiments show that the proposed method outperforms previous works in reconstruction and consistent editing, and produces impressive results in various settings.
Submitted 22 December, 2023; originally announced December 2023.
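The mechanism named above, caching self-attention key/value features during inversion and reusing them during sampling, can be illustrated with a toy sketch. The code below is not the paper's implementation or any diffusion library's API; the attention function, shapes, and timestep count are all assumptions made only to show the cache-and-substitute pattern.

```python
# Toy sketch: cache key/value features per "inversion" step, substitute them during "sampling".
import numpy as np

def softmax(a, axis=-1):
    a = a - a.max(axis=axis, keepdims=True)
    e = np.exp(a)
    return e / e.sum(axis=axis, keepdims=True)

def self_attention(x, k=None, v=None):
    """x: (tokens, dim). If cached k/v are given, attend to them instead of x."""
    q = x
    k = x if k is None else k
    v = x if v is None else v
    return softmax(q @ k.T / np.sqrt(x.shape[-1])) @ v

timesteps, tokens, dim = 10, 16, 8
cache = {}

# "Inversion" pass: record key/value features at every timestep.
feat = np.random.randn(tokens, dim)
for t in range(timesteps):
    cache[t] = (feat.copy(), feat.copy())   # (key, value) at step t
    feat = self_attention(feat)

# "Sampling" pass: reuse the cached inversion features as key/value at the matching step.
feat = np.random.randn(tokens, dim)
for t in reversed(range(timesteps)):
    k_inv, v_inv = cache[t]
    feat = self_attention(feat, k=k_inv, v=v_inv)
print(feat.shape)
```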
arXiv:2311.13326 [pdf, other] (cs.LG, cs.AI, q-fin.PM)
Curriculum Learning and Imitation Learning for Model-free Control on Financial Time-series
Authors: Woosung Koh, Insu Choi, Yuntae Jang, Gimin Kang, Woo Chang Kim
Abstract: Curriculum learning and imitation learning have been leveraged extensively in the robotics domain. However, minimal research has been done on leveraging these ideas on control tasks over highly stochastic time-series data. Here, we theoretically and empirically explore these approaches in a representative control task over complex time-series data. We implement the fundamental ideas of curriculum learning via data augmentation, while imitation learning is implemented via policy distillation from an oracle. Our findings reveal that curriculum learning should be considered a novel direction in improving control-task performance over complex time-series. Our ample random-seed out-sample empirics and ablation studies are highly encouraging for curriculum learning for time-series control. These findings are especially encouraging as we tune all overlapping hyperparameters on the baseline -- giving an advantage to the baseline. On the other hand, we find that imitation learning should be used with caution.
Submitted 12 January, 2024; v1 submitted 22 November, 2023; originally announced November 2023.
Comments: AAAI 2024 AI4TS Workshop Oral
arXiv:2311.00353 [pdf, other] (cs.CV)
LatentWarp: Consistent Diffusion Latents for Zero-Shot Video-to-Video Translation
Authors: Yuxiang Bao, Di Qiu, Guoliang Kang, Baochang Zhang, Bo Jin, Kaiye Wang, Pengfei Yan
Abstract: Leveraging the generative ability of image diffusion models offers great potential for zero-shot video-to-video translation. The key lies in how to maintain temporal consistency across generated video frames by image diffusion models. Previous methods typically adopt cross-frame attention, i.e., sharing the key and value tokens across attentions of different frames, to encourage the temporal consistency. However, in those works, the temporal inconsistency issue may not be thoroughly solved, rendering the fidelity of generated videos limited. In this paper, we find the bottleneck lies in the unconstrained query tokens and propose a new zero-shot video-to-video translation framework, named LatentWarp. Our approach is simple: to constrain the query tokens to be temporally consistent, we further incorporate a warping operation in the latent space to constrain the query tokens. Specifically, based on the optical flow obtained from the original video, we warp the generated latent features of the last frame to align with the current frame during the denoising process. As a result, the corresponding regions across the adjacent frames can share closely-related query tokens and attention outputs, which can further improve latent-level consistency to enhance visual temporal coherence of generated videos. Extensive experiment results demonstrate the superiority of LatentWarp in achieving video-to-video translation with temporal coherence.
Submitted 1 November, 2023; originally announced November 2023.
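The core operation named in the abstract above, warping the previous frame's latent along the optical flow so that adjacent frames see aligned features, can be sketched in a few lines. This is a hedged illustration, not the authors' code; nearest-neighbour sampling and the flow convention (per-pixel offsets from the current to the previous frame) are assumptions.

```python
# Minimal sketch: resample a previous-frame latent along an optical-flow field.
import numpy as np

def warp_latent(prev_latent: np.ndarray, flow: np.ndarray) -> np.ndarray:
    """prev_latent: (C, H, W); flow: (H, W, 2) pixel offsets from current to previous frame."""
    C, H, W = prev_latent.shape
    ys, xs = np.meshgrid(np.arange(H), np.arange(W), indexing="ij")
    src_y = np.clip(np.round(ys + flow[..., 1]).astype(int), 0, H - 1)
    src_x = np.clip(np.round(xs + flow[..., 0]).astype(int), 0, W - 1)
    return prev_latent[:, src_y, src_x]

# Usage: warped = warp_latent(latent_prev_frame, flow); the warped latent then
# constrains the current frame's latent before the attention layers at each denoising step.
```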
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.19202">arXiv:2310.19202</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2310.19202">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Quantitative Methods">q-bio.QM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Improved Motor Imagery Classification Using Adaptive Spatial Filters Based on Particle Swarm Optimization Algorithm </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xiong%2C+X">Xiong Xiong</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Ying Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+T">Tianyuan Song</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+J">Jinguo Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Kang%2C+G">Guixia Kang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.19202v1-abstract-short" style="display: inline;"> As a typical self-paced brain-computer interface (BCI) system, the motor imagery (MI) BCI has been widely applied in fields such as robot control, stroke rehabilitation, and assistance for patients with stroke or spinal cord injury. Many studies have focused on the traditional spatial filters obtained through the common spatial pattern (CSP) method. However, the CSP method can only obtain fixed sp&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.19202v1-abstract-full').style.display = 'inline'; document.getElementById('2310.19202v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.19202v1-abstract-full" style="display: none;"> As a typical self-paced brain-computer interface (BCI) system, the motor imagery (MI) BCI has been widely applied in fields such as robot control, stroke rehabilitation, and assistance for patients with stroke or spinal cord injury. Many studies have focused on the traditional spatial filters obtained through the common spatial pattern (CSP) method. However, the CSP method can only obtain fixed spatial filters for specific input signals. Besides, CSP method only focuses on the variance difference of two types of electroencephalogram (EEG) signals, so the decoding ability of EEG signals is limited. To obtain more effective spatial filters for better extraction of spatial features that can improve classification to MI-EEG, this paper proposes an adaptive spatial filter solving method based on particle swarm optimization algorithm (PSO). A training and testing framework based on filter bank and spatial filters (FBCSP-ASP) is designed for MI EEG signal classification. Comparative experiments are conducted on two public datasets (2a and 2b) from BCI competition IV, which show the outstanding average recognition accuracy of FBCSP-ASP. The proposed method has achieved significant performance improvement on MI-BCI. The classification accuracy of the proposed method has reached 74.61% and 81.19% on datasets 2a and 2b, respectively. 
Compared with the baseline algorithm (FBCSP), the proposed algorithm improves accuracy by 11.44% and 7.11% on the two datasets, respectively. Furthermore, analysis based on mutual information, t-SNE, and Shapley values further proves that ASP features have excellent decoding ability for MI-EEG signals, and explains the improvement in classification performance brought by the introduction of ASP features.
Submitted 29 October, 2023; originally announced October 2023.
Comments: 25 pages, 8 figures
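The entry above combines a CSP-style variance-ratio objective with particle swarm optimization. The sketch below is not the paper's algorithm; it only illustrates a standard PSO loop searching for a spatial filter w that maximizes J(w) = (w^T C1 w) / (w^T C2 w), where C1 and C2 are class-average covariance matrices. Swarm size, inertia, and acceleration coefficients are assumptions.

```python
# Minimal sketch: PSO search for a variance-ratio-maximizing spatial filter.
import numpy as np

def fitness(w, C1, C2):
    return (w @ C1 @ w) / (w @ C2 @ w + 1e-12)

def pso_spatial_filter(C1, C2, n_particles=30, iters=200, seed=0):
    rng = np.random.default_rng(seed)
    d = C1.shape[0]
    x = rng.standard_normal((n_particles, d))          # particle positions (candidate filters)
    v = np.zeros_like(x)                               # particle velocities
    pbest = x.copy()
    pbest_val = np.array([fitness(w, C1, C2) for w in x])
    gbest = pbest[pbest_val.argmax()].copy()
    for _ in range(iters):
        r1, r2 = rng.random((2, n_particles, 1))
        v = 0.7 * v + 1.5 * r1 * (pbest - x) + 1.5 * r2 * (gbest - x)
        x = x + v
        vals = np.array([fitness(w, C1, C2) for w in x])
        improved = vals > pbest_val
        pbest[improved], pbest_val[improved] = x[improved], vals[improved]
        gbest = pbest[pbest_val.argmax()].copy()
    return gbest / np.linalg.norm(gbest)

# Usage: C1, C2 are mean covariances of band-pass filtered EEG trials for the two
# motor-imagery classes; the returned w projects trials to log-variance features.
```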
arXiv:2310.19198 [pdf] (q-bio.QM, cs.LG, eess.SP)
Enhancing Motor Imagery Decoding in Brain Computer Interfaces using Riemann Tangent Space Mapping and Cross Frequency Coupling
Authors: Xiong Xiong, Li Su, Jinguo Huang, Guixia Kang
Abstract: Objective: Motor Imagery (MI) serves as a crucial experimental paradigm within the realm of Brain Computer Interfaces (BCIs), aiming to decode motor intentions from electroencephalogram (EEG) signals. Method: Drawing inspiration from Riemannian geometry and Cross-Frequency Coupling (CFC), this paper introduces a novel approach termed Riemann Tangent Space Mapping using Dichotomous Filter Bank with Convolutional Neural Network (DFBRTS) to enhance the representation quality and decoding capability pertaining to MI features. DFBRTS first initiates the process by meticulously filtering EEG signals through a Dichotomous Filter Bank, structured in the fashion of a complete binary tree. Subsequently, it employs Riemann Tangent Space Mapping to extract salient EEG signal features within each sub-band. Finally, a lightweight convolutional neural network is employed for further feature extraction and classification, operating under the joint supervision of cross-entropy and center loss. To validate the efficacy, extensive experiments were conducted using DFBRTS on two well-established benchmark datasets: the BCI competition IV 2a (BCIC-IV-2a) dataset and the OpenBMI dataset. The performance of DFBRTS was benchmarked against several state-of-the-art MI decoding methods, alongside other Riemannian geometry-based MI decoding approaches. Results: DFBRTS significantly outperforms other MI decoding algorithms on both datasets, achieving a remarkable classification accuracy of 78.16% for four-class and 71.58% for two-class hold-out classification, as compared to the existing benchmarks.
Submitted 29 October, 2023; originally announced October 2023.
Comments: 22 pages, 7 figures
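Riemann tangent space mapping, the feature extractor named in the entry above, has a standard textbook form: each SPD covariance matrix C is projected to the tangent space at a reference C_ref via logm(C_ref^{-1/2} C C_ref^{-1/2}). The sketch below is a generic illustration of that mapping, not DFBRTS itself; using the arithmetic mean as the reference and a plain upper-triangular vectorization are simplifications.

```python
# Minimal sketch: tangent-space features from EEG trial covariance matrices.
import numpy as np

def _sym_powm(C, power):
    """Matrix power of a symmetric positive-definite matrix via eigendecomposition."""
    vals, vecs = np.linalg.eigh(C)
    return (vecs * vals**power) @ vecs.T

def _sym_logm(C):
    vals, vecs = np.linalg.eigh(C)
    return (vecs * np.log(vals)) @ vecs.T

def tangent_space_features(covs):
    """covs: (n_trials, n_ch, n_ch) SPD matrices -> (n_trials, n_ch*(n_ch+1)//2) features."""
    C_ref = covs.mean(axis=0)                 # simplification: arithmetic-mean reference
    inv_sqrt = _sym_powm(C_ref, -0.5)
    iu = np.triu_indices(covs.shape[1])
    feats = []
    for C in covs:
        S = _sym_logm(inv_sqrt @ C @ inv_sqrt)
        feats.append(S[iu])                   # vectorize the upper triangle
    return np.asarray(feats)

# Usage: feed tangent_space_features(trial_covariances) to any standard classifier.
```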
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">22 pages, 7 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.12547">arXiv:2310.12547</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2310.12547">pdf</a>, <a href="https://arxiv.org/format/2310.12547">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> PGA: Personalizing Grasping Agents with Single Human-Robot Interaction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Kim%2C+J">Junghyun Kim</a>, <a href="/search/cs?searchtype=author&amp;query=Kang%2C+G">Gi-Cheon Kang</a>, <a href="/search/cs?searchtype=author&amp;query=Kim%2C+J">Jaein Kim</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+S">Seoyun Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Jung%2C+M">Minjoon Jung</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+B">Byoung-Tak Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.12547v2-abstract-short" style="display: inline;"> Language-Conditioned Robotic Grasping (LCRG) aims to develop robots that comprehend and grasp objects based on natural language instructions. While the ability to understand personal objects like my wallet facilitates more natural interaction with human users, current LCRG systems only allow generic language instructions, e.g., the black-colored wallet next to the laptop. To this end, we introduce&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.12547v2-abstract-full').style.display = 'inline'; document.getElementById('2310.12547v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.12547v2-abstract-full" style="display: none;"> Language-Conditioned Robotic Grasping (LCRG) aims to develop robots that comprehend and grasp objects based on natural language instructions. While the ability to understand personal objects like my wallet facilitates more natural interaction with human users, current LCRG systems only allow generic language instructions, e.g., the black-colored wallet next to the laptop. To this end, we introduce a task scenario GraspMine alongside a novel dataset aimed at pinpointing and grasping personal objects given personal indicators via learning from a single human-robot interaction, rather than a large labeled dataset. Our proposed method, Personalized Grasping Agent (PGA), addresses GraspMine by leveraging the unlabeled image data of the user&#39;s environment, called Reminiscence. Specifically, PGA acquires personal object information by a user presenting a personal object with its associated indicator, followed by PGA inspecting the object by rotating it. 
Based on the acquired information, PGA pseudo-labels objects in the Reminiscence by our proposed label propagation algorithm. Harnessing the information acquired from the interactions and the pseudo-labeled objects in the Reminiscence, PGA adapts the object grounding model to grasp personal objects. This results in significant efficiency, whereas previous LCRG systems rely on resource-intensive human annotations, necessitating hundreds of labeled data points to learn "my wallet". Moreover, PGA outperforms baseline methods across all metrics and even shows comparable performance to the fully-supervised method, which learns from 9k annotated data samples. We further validate PGA's real-world applicability by employing a physical robot to execute GraspMine. Code and data are publicly available at https://github.com/JHKim-snu/PGA.
Submitted 19 March, 2024; v1 submitted 19 October, 2023; originally announced October 2023.
Comments: 8 pages, under review
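The pseudo-labeling step named above propagates a personal-object label from a few seed observations to unlabeled Reminiscence images. The sketch below is a generic similarity-based label propagation, not PGA's algorithm; the cosine-similarity threshold and the number of propagation rounds are assumptions.

```python
# Minimal sketch: propagate a label from seed features to an unlabeled feature pool.
import numpy as np

def propagate_labels(seed_feats, pool_feats, threshold=0.8, rounds=3):
    """seed_feats: (S, D) features of the presented object; pool_feats: (N, D) unlabeled crops."""
    def normalize(x):
        return x / (np.linalg.norm(x, axis=1, keepdims=True) + 1e-8)
    labeled = normalize(seed_feats)
    pool = normalize(pool_feats)
    pseudo = np.zeros(len(pool), dtype=bool)
    for _ in range(rounds):
        sims = pool @ labeled.T                       # cosine similarity to the labeled set
        newly = (~pseudo) & (sims.max(axis=1) >= threshold)
        if not newly.any():
            break
        pseudo |= newly
        labeled = np.vstack([labeled, pool[newly]])   # grow the labeled set for the next round
    return pseudo                                     # boolean mask of pseudo-labeled crops
```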
arXiv:2309.07759 [pdf, other] (cs.CL, cs.RO)
PROGrasp: Pragmatic Human-Robot Communication for Object Grasping
Authors: Gi-Cheon Kang, Junghyun Kim, Jaein Kim, Byoung-Tak Zhang
Abstract: Interactive Object Grasping (IOG) is the task of identifying and grasping the desired object via human-robot natural language interaction. Current IOG systems assume that a human user initially specifies the target object's category (e.g., bottle). Inspired by pragmatics, where humans often convey their intentions by relying on context to achieve goals, we introduce a new IOG task, Pragmatic-IOG, and the corresponding dataset, Intention-oriented Multi-modal Dialogue (IM-Dial). In our proposed task scenario, an intention-oriented utterance (e.g., "I am thirsty") is initially given to the robot. The robot should then identify the target object by interacting with a human user. Based on the task setup, we propose a new robotic system that can interpret the user's intention and pick up the target object, Pragmatic Object Grasping (PROGrasp). PROGrasp performs Pragmatic-IOG by incorporating modules for visual grounding, question asking, object grasping, and most importantly, answer interpretation for pragmatic inference. Experimental results show that PROGrasp is effective in offline (i.e., target object discovery) and online (i.e., IOG with a physical robot arm) settings. Code and data are available at https://github.com/gicheonkang/prograsp.
Submitted 5 April, 2024; v1 submitted 14 September, 2023; originally announced September 2023.
Comments: ICRA 2024
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICRA 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2308.16529">arXiv:2308.16529</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2308.16529">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> Developing Social Robots with Empathetic Non-Verbal Cues Using Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lee%2C+Y+K">Yoon Kyung Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Jung%2C+Y">Yoonwon Jung</a>, <a href="/search/cs?searchtype=author&amp;query=Kang%2C+G">Gyuyi Kang</a>, <a href="/search/cs?searchtype=author&amp;query=Hahn%2C+S">Sowon Hahn</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2308.16529v1-abstract-short" style="display: inline;"> We propose augmenting the empathetic capacities of social robots by integrating non-verbal cues. Our primary contribution is the design and labeling of four types of empathetic non-verbal cues, abbreviated as SAFE: Speech, Action (gesture), Facial expression, and Emotion, in a social robot. These cues are generated using a Large Language Model (LLM). We developed an LLM-based conversational system&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.16529v1-abstract-full').style.display = 'inline'; document.getElementById('2308.16529v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2308.16529v1-abstract-full" style="display: none;"> We propose augmenting the empathetic capacities of social robots by integrating non-verbal cues. Our primary contribution is the design and labeling of four types of empathetic non-verbal cues, abbreviated as SAFE: Speech, Action (gesture), Facial expression, and Emotion, in a social robot. These cues are generated using a Large Language Model (LLM). We developed an LLM-based conversational system for the robot and assessed its alignment with social cues as defined by human counselors. Preliminary results show distinct patterns in the robot&#39;s responses, such as a preference for calm and positive social emotions like &#39;joy&#39; and &#39;lively&#39;, and frequent nodding gestures. Despite these tendencies, our approach has led to the development of a social robot capable of context-aware and more authentic interactions. Our work lays the groundwork for future studies on human-robot interactions, emphasizing the essential role of both verbal and non-verbal cues in creating social and empathetic robots. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.16529v1-abstract-full').style.display = 'none'; document.getElementById('2308.16529v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> In Proceedings of 2023 IEEE International Conference on Robot &amp; Human Interactive Communication (RO-MAN) </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2307.05963">arXiv:2307.05963</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2307.05963">pdf</a>, <a href="https://arxiv.org/format/2307.05963">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> GVCCI: Lifelong Learning of Visual Grounding for Language-Guided Robotic Manipulation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Kim%2C+J">Junghyun Kim</a>, <a href="/search/cs?searchtype=author&amp;query=Kang%2C+G">Gi-Cheon Kang</a>, <a href="/search/cs?searchtype=author&amp;query=Kim%2C+J">Jaein Kim</a>, <a href="/search/cs?searchtype=author&amp;query=Shin%2C+S">Suyeon Shin</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+B">Byoung-Tak Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2307.05963v1-abstract-short" style="display: inline;"> Language-Guided Robotic Manipulation (LGRM) is a challenging task as it requires a robot to understand human instructions to manipulate everyday objects. Recent approaches in LGRM rely on pre-trained Visual Grounding (VG) models to detect objects without adapting to manipulation environments. This results in a performance drop due to a substantial domain gap between the pre-training and real-world&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.05963v1-abstract-full').style.display = 'inline'; document.getElementById('2307.05963v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2307.05963v1-abstract-full" style="display: none;"> Language-Guided Robotic Manipulation (LGRM) is a challenging task as it requires a robot to understand human instructions to manipulate everyday objects. Recent approaches in LGRM rely on pre-trained Visual Grounding (VG) models to detect objects without adapting to manipulation environments. This results in a performance drop due to a substantial domain gap between the pre-training and real-world data. A straightforward solution is to collect additional training data, but the cost of human-annotation is extortionate. In this paper, we propose Grounding Vision to Ceaselessly Created Instructions (GVCCI), a lifelong learning framework for LGRM, which continuously learns VG without human supervision. 
arXiv:2307.05963 [pdf, other] (cs.RO, cs.CV)
GVCCI: Lifelong Learning of Visual Grounding for Language-Guided Robotic Manipulation
Authors: Junghyun Kim, Gi-Cheon Kang, Jaein Kim, Suyeon Shin, Byoung-Tak Zhang
Abstract: Language-Guided Robotic Manipulation (LGRM) is a challenging task as it requires a robot to understand human instructions to manipulate everyday objects. Recent approaches in LGRM rely on pre-trained Visual Grounding (VG) models to detect objects without adapting to manipulation environments. This results in a performance drop due to a substantial domain gap between the pre-training and real-world data. A straightforward solution is to collect additional training data, but the cost of human annotation is extortionate. In this paper, we propose Grounding Vision to Ceaselessly Created Instructions (GVCCI), a lifelong learning framework for LGRM, which continuously learns VG without human supervision. GVCCI iteratively generates synthetic instructions via object detection and trains the VG model with the generated data. We validate our framework in offline and online settings across diverse environments on different VG models. Experimental results show that accumulating synthetic data from GVCCI leads to a steady improvement in VG by up to 56.7% and improves resultant LGRM by up to 29.4%. Furthermore, the qualitative analysis shows that the unadapted VG model often fails to find correct objects due to a strong bias learned from the pre-training data. Finally, we introduce a novel VG dataset for LGRM, consisting of nearly 252k triplets of image-object-instruction from diverse manipulation environments.
Submitted 12 July, 2023; originally announced July 2023.
Comments: Accepted at IROS 2023
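Generating synthetic instructions from object detections, as the entry above describes, can be done with simple templating. The snippet below is only an illustration of that general idea, not GVCCI's generator; the detection fields (name, color, normalized box) and the templates are assumptions.

```python
# Minimal sketch: turn object-detector outputs into templated grasping instructions.
import random

def make_instruction(target: dict, neighbors: list) -> str:
    """target/neighbors: detections like {"name": "cup", "color": "red", "box": (x0, y0, x1, y1)}."""
    templates = [
        "pick up the {color} {name}",
        "grab the {name} on the {side} side",
        "give me the {color} {name} next to the {other}",
    ]
    t = random.choice(templates)
    other = neighbors[0]["name"] if neighbors else "table"
    side = "left" if target["box"][0] < 0.5 else "right"   # assumes normalized coordinates
    return t.format(name=target["name"], color=target["color"], side=side, other=other)

print(make_instruction({"name": "cup", "color": "red", "box": (0.2, 0.4, 0.3, 0.5)},
                       [{"name": "laptop", "color": "black", "box": (0.6, 0.4, 0.8, 0.7)}]))
```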
arXiv:2307.04422 [pdf, other] (cs.RO, eess.SY) doi: 10.1016/j.robot.2024.104760
A Versatile Door Opening System with Mobile Manipulator through Adaptive Position-Force Control and Reinforcement Learning
Authors: Gyuree Kang, Hyunki Seong, Daegyu Lee, D. Hyunchul Shim
Abstract: The ability of robots to navigate through doors is crucial for their effective operation in indoor environments. Consequently, extensive research has been conducted to develop robots capable of opening specific doors. However, the diverse combinations of door handles and opening directions necessitate a more versatile door opening system for robots to successfully operate in real-world environments. In this paper, we propose a mobile manipulator system that can autonomously open various doors without prior knowledge. By using convolutional neural networks, point cloud extraction techniques, and external force measurements during exploratory motion, we obtained information regarding handle types, poses, and door characteristics. Through two different approaches, adaptive position-force control and deep reinforcement learning, we successfully opened doors without precise trajectory or excessive external force. The adaptive position-force control method involves moving the end-effector in the direction of the door opening while responding compliantly to external forces, ensuring safety and manipulator workspace. Meanwhile, the deep reinforcement learning policy minimizes applied forces and eliminates unnecessary movements, enabling stable operation across doors with different poses and widths. The RL-based approach outperforms the adaptive position-force control method in terms of compensating for external forces, ensuring smooth motion, and achieving efficient speed. It reduces the maximum force required by 3.27 times and improves motion smoothness by 1.82 times. However, the non-learning-based adaptive position-force control method demonstrates more versatility in opening a wider range of doors, encompassing revolute doors with four distinct opening directions and varying widths.
Submitted 10 July, 2023; originally announced July 2023.
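The position-force idea described above, tracking a nominal opening trajectory while yielding to measured contact forces, is essentially an admittance-style control loop. The sketch below is a hedged, generic illustration and not the paper's controller; the gains, time step, and 3-DoF task space are assumptions.

```python
# Minimal sketch: one step of a first-order admittance-style position-force law.
import numpy as np

def admittance_step(x_cmd, x_des, f_ext, k_p=1.0, c_admit=0.02, dt=0.01):
    """x_cmd/x_des: (3,) current command and nominal target [m]; f_ext: (3,) measured force [N]."""
    v_track = k_p * (x_des - x_cmd)       # follow the nominal door-opening trajectory
    v_comply = c_admit * f_ext            # yield in the direction of the external force
    return x_cmd + dt * (v_track + v_comply)

# Usage inside a control loop: read f_ext from a wrist force/torque sensor, advance x_des
# along the estimated opening arc, and send x_cmd to the manipulator's Cartesian controller.
x_cmd = np.zeros(3)
for step in range(100):
    x_des = np.array([0.0, 0.001 * step, 0.0])   # toy nominal trajectory
    f_ext = np.array([0.0, -2.0, 0.0])           # toy constant contact force
    x_cmd = admittance_step(x_cmd, x_des, f_ext)
print(x_cmd)
```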
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2307.00965">arXiv:2307.00965</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2307.00965">pdf</a>, <a href="https://arxiv.org/format/2307.00965">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> OpenClinicalAI: An Open and Dynamic Model for Alzheimer&#39;s Disease Diagnosis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Huang%2C+Y">Yunyou Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Liang%2C+X">Xiaoshuang Liang</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+X">Xiangjiang Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Miao%2C+X">Xiuxia Miao</a>, <a href="/search/cs?searchtype=author&amp;query=Xie%2C+J">Jiyue Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+W">Wenjing Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+F">Fan Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Kang%2C+G">Guoxin Kang</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+L">Li Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+S">Suqin Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zhifei Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhan%2C+J">Jianfeng Zhan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2307.00965v1-abstract-short" style="display: inline;"> Although Alzheimer&#39;s disease (AD) cannot be reversed or cured, timely diagnosis can significantly reduce the burden of treatment and care. Current research on AD diagnosis models usually regards the diagnosis task as a typical classification task with two primary assumptions: 1) All target categories are known a priori; 2) The diagnostic strategy for each patient is consistent, that is, the number&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.00965v1-abstract-full').style.display = 'inline'; document.getElementById('2307.00965v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2307.00965v1-abstract-full" style="display: none;"> Although Alzheimer&#39;s disease (AD) cannot be reversed or cured, timely diagnosis can significantly reduce the burden of treatment and care. Current research on AD diagnosis models usually regards the diagnosis task as a typical classification task with two primary assumptions: 1) All target categories are known a priori; 2) The diagnostic strategy for each patient is consistent, that is, the number and type of model input data for each patient are the same. However, real-world clinical settings are open, with complexity and uncertainty in terms of both subjects and the resources of the medical institutions. This means that diagnostic models may encounter unseen disease categories and need to dynamically develop diagnostic strategies based on the subject&#39;s specific circumstances and available medical resources. 
Thus, the AD diagnosis task is tangled and coupled with the diagnosis strategy formulation. To promote the application of diagnostic systems in real-world clinical settings, we propose OpenClinicalAI for direct AD diagnosis in complex and uncertain clinical settings. This is the first powerful end-to-end model to dynamically formulate diagnostic strategies and provide diagnostic results based on the subject's conditions and available medical resources. OpenClinicalAI combines reciprocally coupled deep multiaction reinforcement learning (DMARL) for diagnostic strategy formulation and multicenter meta-learning (MCML) for open-set recognition. The experimental results show that OpenClinicalAI achieves better performance and requires fewer clinical examinations than the state-of-the-art model. Our method provides an opportunity to embed the AD diagnostic system into the current health care system to cooperate with clinicians to improve current health care.
Submitted 3 July, 2023; originally announced July 2023.
Comments: Real-world clinical setting, Alzheimer's disease, diagnose, AI, deep learning. arXiv admin note: text overlap with arXiv:2109.04004
arXiv:2305.14773  [pdf, other]  cs.RO
DOI: 10.1109/ICRA48891.2023.10161518
Title: Robust Imaging Sonar-based Place Recognition and Localization in Underwater Environments
Authors: Hogyun Kim, Gilhwan Kang, Seokhwan Jeong, Seungjun Ma, Younggun Cho
Abstract: Place recognition using SOund Navigation and Ranging (SONAR) images is an important task for simultaneous localization and mapping (SLAM) in underwater environments. This paper proposes a robust and efficient imaging SONAR-based place recognition, SONAR context, and loop closure method. Unlike previous methods, our approach encodes geometric information based on the characteristics of raw SONAR measurements without prior knowledge or training. We also design a hierarchical searching procedure for fast retrieval of candidate SONAR frames and apply adaptive shifting and padding to achieve robust matching under rotation and translation changes. In addition, we can derive the initial pose through adaptive shifting and apply it to the iterative closest point (ICP) based loop closure factor. We evaluate the performance of SONAR context in various underwater sequences such as simulated open water, a real water tank, and real underwater environments. The proposed approach shows the robustness and improvements of place recognition on various datasets and evaluation metrics. Supplementary materials are available at https://github.com/sparolab/sonar_context.git.
Submitted: 24 May, 2023; originally announced May 2023.
Comments: 7 pages, 8 figures
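As a rough illustration of the rotation-robust matching mentioned above (not the paper's actual SONAR context descriptor or its adaptive shifting and padding), the following sketch compares two polar-grid descriptors under every circular azimuth shift and keeps the best distance; all names and sizes are placeholders.

```python
import numpy as np

def shift_match(query: np.ndarray, candidate: np.ndarray):
    """Compare two polar-grid descriptors (range bins x azimuth bins) by
    trying every circular azimuth shift and keeping the smallest mean L1
    distance. This mimics generic shift-based matching; the real descriptor
    and its adaptive shifting/padding are described in the paper."""
    best_dist, best_shift = np.inf, 0
    for s in range(candidate.shape[1]):
        shifted = np.roll(candidate, s, axis=1)
        d = np.abs(query - shifted).mean()
        if d < best_dist:
            best_dist, best_shift = d, s
    return best_dist, best_shift

# toy usage: a random descriptor matched against a rotated copy of itself
rng = np.random.default_rng(0)
desc = rng.random((20, 60))
dist, shift = shift_match(desc, np.roll(desc, 13, axis=1))
print(dist, shift)   # near-zero distance is recovered at the compensating shift
```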
arXiv:2305.11488  [pdf, other]  cs.CV
Title: AttriCLIP: A Non-Incremental Learner for Incremental Knowledge Learning
Authors: Runqi Wang, Xiaoyue Duan, Guoliang Kang, Jianzhuang Liu, Shaohui Lin, Songcen Xu, Jinhu Lv, Baochang Zhang
Abstract: Continual learning aims to enable a model to incrementally learn knowledge from sequentially arrived data. Previous works adopt the conventional classification architecture, which consists of a feature extractor and a classifier. The feature extractor is shared across sequentially arrived tasks or classes, but the group of classifier weights corresponding to each new class must be incrementally expanded. Consequently, the parameters of a continual learner gradually increase. Moreover, as the classifier covers all historically arrived classes, a certain amount of memory is usually required to store rehearsal data to mitigate classifier bias and catastrophic forgetting. In this paper, we propose a non-incremental learner, named AttriCLIP, to incrementally extract knowledge of new classes or tasks. Specifically, AttriCLIP is built upon the pre-trained visual-language model CLIP. Its image encoder and text encoder are fixed to extract features from both images and text. Text consists of a category name and a fixed number of learnable parameters which are selected from our designed attribute word bank and serve as attributes. As we compute the visual and textual similarity for classification, AttriCLIP is a non-incremental learner. The attribute prompts, which encode the common knowledge useful for classification, can effectively mitigate catastrophic forgetting and avoid constructing a replay memory. We evaluate AttriCLIP and compare it with CLIP-based and previous state-of-the-art continual learning methods in realistic settings with domain shift and long-sequence learning. The results show that our method performs favorably against previous state-of-the-art methods. The implementation code is available at https://github.com/bhrqw/AttriCLIP.
Submitted: 20 March, 2024; v1 submitted 19 May, 2023; originally announced May 2023.
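A minimal sketch of the structure described above, assuming frozen encoders, a set of learnable prompt vectors shared across classes, and classification by cosine similarity between image and text embeddings. The linear "encoders" below are stand-ins, not CLIP, the attribute-selection mechanism from the word bank is omitted, and every name and dimension is hypothetical.

```python
import torch
import torch.nn.functional as F

torch.manual_seed(0)
embed_dim, num_classes, num_prompts = 32, 5, 4

# Stand-in frozen encoders (the real method uses CLIP's image/text encoders).
image_encoder = torch.nn.Linear(64, embed_dim)
text_encoder = torch.nn.Linear(16 * (1 + num_prompts), embed_dim)
for p in list(image_encoder.parameters()) + list(text_encoder.parameters()):
    p.requires_grad_(False)                       # encoders stay frozen

class_name_emb = torch.randn(num_classes, 16)     # fixed "category name" tokens
prompts = torch.nn.Parameter(torch.randn(num_prompts, 16))  # learnable prompts

def classify(image_input):
    # Build one text input per class: [class name ; shared learnable prompts].
    text_in = torch.cat(
        [class_name_emb, prompts.flatten().expand(num_classes, -1)], dim=1)
    img = F.normalize(image_encoder(image_input), dim=-1)
    txt = F.normalize(text_encoder(text_in), dim=-1)
    return img @ txt.t()                          # cosine similarities as logits

logits = classify(torch.randn(2, 64))
print(logits.shape)                               # (2, num_classes)
```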
arXiv:2305.07945  [pdf, other]  cs.IT eess.SP
Title: Deep Learning-based Data-aided Activity Detection with Extraction Network in Grant-free Sparse Code Multiple Access Systems
Authors: Minsig Han, Ameha T. Abebe, Chung G. Kang
Abstract: This letter proposes a deep learning-based data-aided active user detection network (D-AUDN) for grant-free sparse code multiple access (SCMA) systems that leverages both SCMA codebook and Zadoff-Chu preamble for activity detection. Due to disparate data and preamble distribution as well as codebook collision, existing D-AUDNs experience performance degradation when multiple preambles are associated with each codebook. To address this, a user activity extraction network (UAEN) is integrated within the D-AUDN to extract a-priori activity information from the codebook, improving activity detection of the associated preambles. Additionally, efficient SCMA codebook design and Zadoff-Chu preamble association are considered to further enhance performance.
Submitted: 19 May, 2023; v1 submitted 13 May, 2023; originally announced May 2023.
arXiv:2304.11609  [pdf]  cs.CV
Title: PiClick: Picking the desired mask from multiple candidates in click-based interactive segmentation
Authors: Cilin Yan, Haochen Wang, Jie Liu, Xiaolong Jiang, Yao Hu, Xu Tang, Guoliang Kang, Efstratios Gavves
Abstract: Click-based interactive segmentation aims to generate target masks via human clicking, which facilitates efficient pixel-level annotation and image editing. In such a task, target ambiguity remains a problem hindering the accuracy and efficiency of segmentation. That is, in scenes with rich context, one click may correspond to multiple potential targets, while most previous interactive segmentors only generate a single mask and fail to deal with target ambiguity. In this paper, we propose a novel interactive segmentation network named PiClick, to yield all potentially reasonable masks and suggest the most plausible one for the user. Specifically, PiClick utilizes a Transformer-based architecture to generate all potential target masks by mutually interactive mask queries. Moreover, a Target Reasoning module (TRM) is designed in PiClick to automatically suggest the user-desired mask from all candidates, relieving target ambiguity and extra-human efforts. Extensive experiments on 9 interactive segmentation datasets demonstrate PiClick performs favorably against previous state-of-the-arts considering the segmentation results. Moreover, we show that PiClick effectively reduces human efforts in annotating and picking the desired masks. To ease the usage and inspire future research, we release the source code of PiClick together with a plug-and-play annotation tool at https://github.com/cilinyan/PiClick.
Submitted: 17 June, 2024; v1 submitted 23 April, 2023; originally announced April 2023.
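Illustrative only: once a model has produced several candidate masks and a plausibility score for each (the role the Target Reasoning module plays above), picking the suggestion reduces to an argmax over the scores. The sketch below assumes hypothetical candidate_masks and scores arrays and is not PiClick's implementation.

```python
import numpy as np

def pick_mask(candidate_masks: np.ndarray, scores: np.ndarray):
    """Given N candidate masks (N, H, W) and one plausibility score per mask
    (e.g. the output of a target-reasoning head), return the best mask and
    keep the rest so the user can still override the suggestion."""
    order = np.argsort(scores)[::-1]          # most plausible first
    return candidate_masks[order[0]], candidate_masks[order[1:]]

# toy usage: three 4x4 candidate masks with hypothetical scores
masks = np.random.default_rng(1).random((3, 4, 4)) > 0.5
best, alternatives = pick_mask(masks, np.array([0.2, 0.9, 0.4]))
print(best.shape, len(alternatives))
```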
arXiv:2303.05118  [pdf, other]  cs.CV cs.AI cs.LG
Title: SLCA: Slow Learner with Classifier Alignment for Continual Learning on a Pre-trained Model
Authors: Gengwei Zhang, Liyuan Wang, Guoliang Kang, Ling Chen, Yunchao Wei
Abstract: The goal of continual learning is to improve the performance of recognition models in learning sequentially arrived data. Although most existing works are established on the premise of learning from scratch, growing efforts have been devoted to incorporating the benefits of pre-training. However, how to adaptively exploit the pre-trained knowledge for each incremental task while maintaining its generalizability remains an open question. In this work, we present an extensive analysis of continual learning on a pre-trained model (CLPM), and attribute the key challenge to a progressive overfitting problem. Observing that selectively reducing the learning rate can almost resolve this issue in the representation layer, we propose a simple but extremely effective approach named Slow Learner with Classifier Alignment (SLCA), which further improves the classification layer by modeling the class-wise distributions and aligning the classification layers in a post-hoc fashion. Across a variety of scenarios, our proposal provides substantial improvements for CLPM (e.g., up to 49.76%, 50.05%, 44.69% and 40.16% on Split CIFAR-100, Split ImageNet-R, Split CUB-200 and Split Cars-196, respectively), and thus outperforms state-of-the-art approaches by a large margin. Based on such a strong baseline, critical factors and promising directions are analyzed in-depth to facilitate subsequent research. Code has been made available at: https://github.com/GengDavid/SLCA.
Submitted: 9 October, 2023; v1 submitted 9 March, 2023; originally announced March 2023.
Comments: ICCV 2023, code released
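A minimal sketch of the "slow learner" idea above, assuming a PyTorch-style setup: the pre-trained representation layers get a much smaller learning rate than the classification head. The modules, learning rates, and optimizer choice below are placeholders, and the post-hoc classifier-alignment step is not shown.

```python
import torch

# Stand-ins: "backbone" plays the role of a pre-trained encoder, "classifier"
# is the per-task head. Only the two-speed learning rates are illustrated.
backbone = torch.nn.Sequential(torch.nn.Linear(128, 64), torch.nn.ReLU())
classifier = torch.nn.Linear(64, 10)

optimizer = torch.optim.SGD(
    [
        {"params": backbone.parameters(), "lr": 1e-4},    # slow: protects pre-trained features
        {"params": classifier.parameters(), "lr": 1e-2},  # fast: new task head
    ],
    momentum=0.9,
)

x, y = torch.randn(8, 128), torch.randint(0, 10, (8,))
loss = torch.nn.functional.cross_entropy(classifier(backbone(x)), y)
loss.backward()
optimizer.step()
```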
arXiv:2302.12954  [pdf, other]  cs.PF
Title: WPC: Whole-picture Workload Characterization
Authors: Lei Wang, Kaiyong Yang, Chenxi Wang, Wanling Gao, Chunjie Luo, Fan Zhang, Zhongxin Ge, Li Zhang, Guoxin Kang, Jianfeng Zhan
Abstract: This article raises an important and challenging workload characterization issue: can we uncover how much each critical component across the stacks contributes to any specific bottleneck? The typical critical components include languages, programming frameworks, runtime environments, instruction set architectures (ISA), operating systems (OS), and microarchitecture. Tackling this issue could help establish a systematic methodology to guide software and hardware co-design and critical component optimizations. We propose a whole-picture workload characterization (WPC) methodology to answer this question. In essence, WPC is an iterative ORFE loop consisting of four steps: Observation, Reference, Fusion, and Exploration. WPC observes data at different levels (observation), fuses and normalizes the performance data (fusion) with respect to a well-designed suite of standard reference workloads (reference), and explores the software and hardware co-design space (exploration) to investigate the impacts of critical components across the stacks. We build and open-source the WPC tool. Our evaluations confirm that WPC can quantitatively reveal the contributions of the language, framework, runtime environment, ISA, OS, and microarchitecture to primary pipeline efficiency.
Submitted: 24 February, 2023; originally announced February 2023.
arXiv:2302.09927  [pdf, other]  cs.DB
Title: NHtapDB: Native HTAP Databases
Authors: Guoxin Kang, Lei Wang, Simin Chen, Jianfeng Zhan
Abstract: A native database (1) provides a near-data machine learning framework to facilitate generating real-time business insight, with predefined change thresholds triggering online training and deployment of new models, and (2) offers a mixed-format store to guarantee the performance of HTAP workloads, especially hybrid workloads that consist of OLAP queries in-between online transactions. We make rigorous test plans for the native database with an enhanced state-of-the-art HTAP benchmark.
Submitted: 20 February, 2023; originally announced February 2023.
arXiv:2212.00721  [pdf, other]  cs.DC cs.NI
Title: High fusion computers: The IoTs, edges, data centers, and humans-in-the-loop as a computer
Authors: Wanling Gao, Lei Wang, Mingyu Chen, Jin Xiong, Chunjie Luo, Wenli Zhang, Yunyou Huang, Weiping Li, Guoxin Kang, Chen Zheng, Biwei Xie, Shaopeng Dai, Qian He, Hainan Ye, Yungang Bao, Jianfeng Zhan
Abstract: Emerging and future applications rely heavily upon systems consisting of Internet of Things (IoT), edges, data centers, and humans-in-the-loop. Significantly different from warehouse-scale computers that serve independent concurrent user requests, this new class of computer systems directly interacts with the physical world, considering humans an essential part and performing safety-critical and mission-critical operations; their computations have intertwined dependencies not only between adjacent execution loops but also between actions or decisions triggered by IoTs, edges, data centers, or humans-in-the-loop; and the systems must first satisfy the accuracy metric in predicting, interpreting, or taking action before meeting the performance goal under different cases. This article argues that we need a paradigm shift to reconstruct the IoTs, edges, data centers, and humans-in-the-loop as a computer rather than a distributed system. We coin a new term, high fusion computers (HFCs), to describe this class of systems. The fusion in the term has two implications: fusing IoTs, edges, data centers, and humans-in-the-loop as a computer, and fusing the physical and digital worlds through HFC systems. HFC is a pivotal case of the open-source computer systems initiative. We lay out the challenges and plan, and call for uniting our community's wisdom and actions to address the HFC challenges. Everything, including the source code, will be publicly available from the project homepage: https://www.computercouncil.org/HFC/.
Submitted: 18 November, 2022; originally announced December 2022.
Comments: This paper has been published in BenchCouncil Transactions on Benchmarks, Standards and Evaluations (TBench). Link: https://www.sciencedirect.com/science/article/pii/S277248592200062X
Journal ref: BenchCouncil Transactions on Benchmarks, Standards and Evaluations (2022)
arXiv:2211.15180  [pdf, other]  cs.CV
Title: Rethinking the Number of Shots in Robust Model-Agnostic Meta-Learning
Authors: Xiaoyue Duan, Guoliang Kang, Runqi Wang, Shumin Han, Song Xue, Tian Wang, Baochang Zhang
Abstract: Robust Model-Agnostic Meta-Learning (MAML) is usually adopted to train a meta-model which can quickly adapt to novel classes with only a few exemplars while remaining robust to adversarial attacks. The conventional solution for robust MAML is to introduce robustness-promoting regularization during the meta-training stage. With such regularization, previous robust MAML methods simply follow the typical MAML practice that the number of training shots should match the number of test shots to achieve optimal adaptation performance. However, although robustness can be largely improved, previous methods sacrifice a great deal of clean accuracy. In this paper, we observe that introducing robustness-promoting regularization into MAML reduces the intrinsic dimension of clean sample features, which results in a lower capacity of clean representations. This may explain why the clean accuracy of previous robust MAML methods drops severely. Based on this observation, we propose a simple strategy, i.e., increasing the number of training shots, to mitigate the loss of intrinsic dimension caused by robustness-promoting regularization. Though simple, our method remarkably improves the clean accuracy of MAML without much loss of robustness, producing a robust yet accurate model. Extensive experiments demonstrate that our method outperforms prior art in achieving a better trade-off between accuracy and robustness. Besides, we observe that our method is less sensitive to the number of fine-tuning steps during meta-training, which allows for a reduced number of fine-tuning steps to improve training efficiency.
Submitted: 28 November, 2022; originally announced November 2022.
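The strategy above amounts to sampling more support ("training") shots per class during meta-training than will be available at test time. A small episode-sampling sketch of that idea follows; the function, dataset shapes, and shot counts are illustrative, and the adversarial regularization and the MAML inner loop are omitted.

```python
import numpy as np

def sample_episode(features, labels, n_way=5, train_shots=10, test_shots=1, rng=None):
    """Build one few-shot episode. The only point illustrated here is using
    MORE support ("training") shots during meta-training than the number of
    shots expected at test time (train_shots > test_shots)."""
    rng = rng or np.random.default_rng()
    classes = rng.choice(np.unique(labels), size=n_way, replace=False)
    support, query = [], []
    for c in classes:
        idx = rng.permutation(np.where(labels == c)[0])
        support.extend(idx[:train_shots])
        query.extend(idx[train_shots:train_shots + test_shots])
    return features[support], labels[support], features[query], labels[query]

# toy usage: 20 classes with 30 samples each
rng = np.random.default_rng(0)
feats = rng.random((600, 16))
labs = np.repeat(np.arange(20), 30)
xs, ys, xq, yq = sample_episode(feats, labs, rng=rng)
print(xs.shape, xq.shape)   # (50, 16) support vs (5, 16) query
```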
arXiv:2210.17302  [pdf, other]  cs.RO eess.SY
Title: Design, Field Evaluation, and Traffic Analysis of a Competitive Autonomous Driving Model in a Congested Environment
Authors: Daegyu Lee, Hyunki Seong, Seungil Han, Gyuree Kang, D. Hyunchul Shim, Yoonjin Yoon
Abstract: Recently, numerous studies have investigated cooperative traffic systems using vehicle-to-everything (V2X) communication. Unfortunately, when multiple autonomous vehicles are deployed while exposed to communication failure, there may be a conflict of ideal conditions between the vehicles, leading to adversarial situations on the roads. In South Korea, virtual and real-world urban autonomous multi-vehicle races were held in March and November of 2021, respectively. During the competition, multiple vehicles were involved simultaneously, which required maneuvers such as overtaking low-speed vehicles, negotiating intersections, and obeying traffic laws. In this study, we introduce a fully autonomous driving software stack to deploy a competitive driving model, which enabled us to win the urban autonomous multi-vehicle races. We evaluate module-based systems such as navigation, perception, and planning in real and virtual environments. Additionally, an analysis of traffic is performed after collecting position data from multiple vehicles over communication to gain additional insight into a multi-agent autonomous driving scenario. Finally, we propose a method for analyzing traffic in order to compare the spatial distributions of multiple autonomous vehicles. We study the similarity distribution between each team's driving log data to determine the impact of competitive autonomous driving on the traffic environment.
Submitted: 6 November, 2022; v1 submitted 31 October, 2022; originally announced October 2022.
arXiv:2208.10725  [pdf, other]  cs.NI
Title: DRL-based Distributed Resource Allocation for Edge Computing in Cell-Free Massive MIMO Network
Authors: Fitsum Debebe Tilahun, Ameha Tsegaye Abebe, Chung G. Kang
Abstract: In this paper, with the aim of addressing the stringent computing and quality-of-service (QoS) requirements of recently introduced advanced multimedia services, we consider a cell-free massive MIMO-enabled mobile edge network. In particular, benefiting from reliable cell-free links to offload intensive computation to the edge server, resource-constrained end-users can augment on-board (local) processing with edge computing. To this end, we formulate a joint communication and computing resource allocation (JCCRA) problem to minimize the total energy consumption of the users while meeting their respective user-specific deadlines. To tackle the problem, we propose a fully distributed solution approach based on a cooperative multi-agent reinforcement learning framework, wherein each user is implemented as a learning agent that makes joint resource allocation decisions relying on local information only. The simulation results demonstrate that the proposed distributed approach outperforms the heuristic baselines, converging to a centralized target benchmark without resorting to large overhead. Moreover, we show that the proposed algorithm performs significantly better in the cell-free system than in cellular MEC systems, e.g., a small-cell-based MEC system.
Submitted: 23 August, 2022; originally announced August 2022.
Comments: 6 pages, 4 figures, conference. arXiv admin note: substantial text overlap with arXiv:2201.09057
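To make the "fully distributed" aspect above concrete: each user acts as its own agent and maps only its local observation to a joint allocation action. The sketch below uses a placeholder random linear policy; it is not the paper's trained cooperative multi-agent RL algorithm, and all names and dimensions are assumptions.

```python
import numpy as np

rng = np.random.default_rng(0)

class LocalAgent:
    """One end-user as an independent decision maker: it sees only its own
    (local) observation and outputs a joint allocation action. The tiny
    random linear 'policy' is a placeholder for a trained cooperative
    multi-agent RL policy."""
    def __init__(self, obs_dim=4, act_dim=2):
        self.w = rng.normal(size=(obs_dim, act_dim)) * 0.1

    def act(self, local_obs):
        raw = local_obs @ self.w
        return 1.0 / (1.0 + np.exp(-raw))   # e.g. [offload fraction, tx power level] in (0, 1)

# toy usage: three users decide independently from local channel/queue state
agents = [LocalAgent() for _ in range(3)]
observations = rng.random((3, 4))
actions = [agent.act(obs) for agent, obs in zip(agents, observations)]
print(np.round(actions, 3))
```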
class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>
