Search | arXiv e-print repository

<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 178 results for author: <span class="mathjax">Yoo, J</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Yoo%2C+J">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Yoo, J"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Yoo%2C+J&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Yoo, J"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Yoo%2C+J&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Yoo%2C+J&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Yoo%2C+J&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Yoo%2C+J&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Yoo%2C+J&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13607">arXiv:2411.13607</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.13607">pdf</a>, <a href="https://arxiv.org/format/2411.13607">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> VioPose: Violin Performance 4D Pose Estimation by Hierarchical Audiovisual Inference </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yoo%2C+S+J">Seong Jong Yoo</a>, <a href="/search/cs?searchtype=author&amp;query=Shrestha%2C+S">Snehesh Shrestha</a>, <a href="/search/cs?searchtype=author&amp;query=Muresanu%2C+I">Irina Muresanu</a>, <a href="/search/cs?searchtype=author&amp;query=Ferm%C3%BCller%2C+C">Cornelia Ferm眉ller</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13607v1-abstract-short" style="display: inline;"> Musicians delicately control their bodies to generate music. Sometimes, their motions are too subtle to be captured by the human eye. To analyze how they move to produce the music, we need to estimate precise 4D human pose (3D pose over time). 
Abstract: Musicians delicately control their bodies to generate music. Sometimes, their motions are too subtle to be captured by the human eye. To analyze how they move to produce the music, we need to estimate precise 4D human pose (3D pose over time). However, current state-of-the-art (SoTA) visual pose estimation algorithms struggle to produce accurate monocular 4D poses because of occlusions, partial views, and human-object interactions. They are limited by the viewing angle, pixel density, and sampling rate of the cameras and fail to estimate fast and subtle movements, such as in the musical effect of vibrato. We leverage the direct causal relationship between the music produced and the human motions creating it to address these challenges. We propose VioPose: a novel multimodal network that hierarchically estimates dynamics. High-level features are cascaded to low-level features and integrated into Bayesian updates. Our architecture is shown to produce accurate pose sequences, facilitating precise motion analysis, and outperforms SoTA. As part of this work, we collected the largest and most diverse calibrated violin-playing dataset, including video, sound, and 3D motion capture poses. Project page: https://sj-yoo.info/viopose/.
Submitted 19 November, 2024; originally announced November 2024.
Comments: Accepted by WACV 2025 in Round 1. First two authors contributed equally.
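The cascaded Bayesian-update idea in this abstract can be illustrated with a toy precision-weighted fusion of two noisy estimates. This is a minimal sketch under Gaussian assumptions, not the authors' architecture; the function name, noise levels, and signal are all invented for illustration.

```python
# Toy sketch (not the authors' code): fusing a video-based pose estimate
# with an audio-derived estimate via a precision-weighted (Bayesian) update,
# the kind of cascade the abstract describes at a high level.
import numpy as np

def fuse_estimates(mu_v, var_v, mu_a, var_a):
    """Combine two noisy estimates of the same joint trajectory.

    mu_v/var_v: video-based mean and variance (good at coarse motion).
    mu_a/var_a: audio-based mean and variance (good at fast, subtle motion).
    Returns the posterior mean and variance under Gaussian assumptions.
    """
    w_v = 1.0 / var_v
    w_a = 1.0 / var_a
    mu = (w_v * mu_v + w_a * mu_a) / (w_v + w_a)
    var = 1.0 / (w_v + w_a)
    return mu, var

# Example: a vibrato-like oscillation that video tracks noisily but an
# audio-derived cue tracks sharply.
t = np.linspace(0, 1, 100)
truth = 0.01 * np.sin(2 * np.pi * 8 * t)            # subtle 8 Hz wrist motion
video = truth + np.random.normal(0, 0.02, t.size)    # noisy video estimate
audio = truth + np.random.normal(0, 0.005, t.size)   # sharper audio-derived cue
fused, _ = fuse_estimates(video, 0.02**2, audio, 0.005**2)
print(np.abs(fused - truth).mean() < np.abs(video - truth).mean())  # True, w.h.p.
```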

2. arXiv:2410.22918 [pdf, other] (cs.LG)
Simulation-Free Training of Neural ODEs on Paired Data
Authors: Semin Kim, Jaehoon Yoo, Jinwoo Kim, Yeonwoo Cha, Saehoon Kim, Seunghoon Hong
Abstract: In this work, we investigate a method for simulation-free training of Neural Ordinary Differential Equations (NODEs) for learning deterministic mappings between paired data. Despite the analogy of NODEs to continuous-depth residual networks, their application in typical supervised learning tasks has not been popular, mainly due to the large number of function evaluations required by ODE solvers and numerical instability in gradient estimation. To alleviate this problem, we employ the flow matching framework for simulation-free training of NODEs, which directly regresses the parameterized dynamics function onto a predefined target velocity field. Contrary to generative tasks, however, we show that applying flow matching directly between paired data can often lead to an ill-defined flow that breaks the coupling of the data pairs (e.g., due to crossing trajectories). We propose a simple extension that applies flow matching in the embedding space of data pairs, where the embeddings are learned jointly with the dynamics function to ensure the validity of the flow and to make it easier to learn. We demonstrate the effectiveness of our method on both regression and classification tasks, where our method outperforms existing NODEs with a significantly lower number of function evaluations. The code is available at https://github.com/seminkim/simulation-free-node.
Submitted 30 October, 2024; originally announced October 2024.
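A minimal sketch of the training objective the abstract describes (flow matching between paired data in a jointly learned embedding space) might look as follows. The module names are illustrative, and the sketch omits the decoder and regularization a full implementation would need to avoid trivial embeddings; see the authors' repository above for the real code.

```python
# Minimal flow-matching sketch for paired data (x, y), loosely following the
# abstract. Training regresses a velocity field onto the constant velocity of
# a linear interpolant in embedding space: no ODE solver is called.
import torch
import torch.nn as nn

enc_x = nn.Linear(2, 16)   # embeds inputs; learned jointly with the dynamics
enc_y = nn.Linear(2, 16)   # embeds targets
vfield = nn.Sequential(nn.Linear(17, 64), nn.Tanh(), nn.Linear(64, 16))

params = [*enc_x.parameters(), *enc_y.parameters(), *vfield.parameters()]
opt = torch.optim.Adam(params, lr=1e-3)

x = torch.randn(256, 2)                                 # toy paired data
y = torch.stack([x[:, 0] + x[:, 1], x[:, 0] * x[:, 1]], dim=1)

for step in range(1000):
    zx, zy = enc_x(x), enc_y(y)
    t = torch.rand(x.size(0), 1)
    zt = (1 - t) * zx + t * zy       # linear interpolant in embedding space
    target_v = zy - zx               # its constant velocity: regression target
    pred_v = vfield(torch.cat([zt, t], dim=1))
    loss = ((pred_v - target_v) ** 2).mean()
    opt.zero_grad()
    loss.backward()
    opt.step()
```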
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22918v1-abstract-full').style.display = 'none'; document.getElementById('2410.22918v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20366">arXiv:2410.20366</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.20366">pdf</a>, <a href="https://arxiv.org/format/2410.20366">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> </div> </div> <p class="title is-5 mathjax"> Rethinking Reconstruction-based Graph-Level Anomaly Detection: Limitations and a Simple Remedy </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Kim%2C+S">Sunwoo Kim</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+S+Y">Soo Yong Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Bu%2C+F">Fanchen Bu</a>, <a href="/search/cs?searchtype=author&amp;query=Kang%2C+S">Shinhwan Kang</a>, <a href="/search/cs?searchtype=author&amp;query=Kim%2C+K">Kyungho Kim</a>, <a href="/search/cs?searchtype=author&amp;query=Yoo%2C+J">Jaemin Yoo</a>, <a href="/search/cs?searchtype=author&amp;query=Shin%2C+K">Kijung Shin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.20366v1-abstract-short" style="display: inline;"> Graph autoencoders (Graph-AEs) learn representations of given graphs by aiming to accurately reconstruct them. A notable application of Graph-AEs is graph-level anomaly detection (GLAD), whose objective is to identify graphs with anomalous topological structures and/or node features compared to the majority of the graph population. Graph-AEs for GLAD regard a graph with a high mean reconstruction&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20366v1-abstract-full').style.display = 'inline'; document.getElementById('2410.20366v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20366v1-abstract-full" style="display: none;"> Graph autoencoders (Graph-AEs) learn representations of given graphs by aiming to accurately reconstruct them. A notable application of Graph-AEs is graph-level anomaly detection (GLAD), whose objective is to identify graphs with anomalous topological structures and/or node features compared to the majority of the graph population. Graph-AEs for GLAD regard a graph with a high mean reconstruction error (i.e. mean of errors from all node pairs and/or nodes) as anomalies. Namely, the methods rest on the assumption that they would better reconstruct graphs with similar characteristics to the majority. We, however, report non-trivial counter-examples, a phenomenon we call reconstruction flip, and highlight the limitations of the existing Graph-AE-based GLAD methods. 
Specifically, we empirically and theoretically investigate when this assumption holds and when it fails. Through our analyses, we further argue that, while the reconstruction errors for a given graph are effective features for GLAD, leveraging multifaceted summaries of the reconstruction errors, beyond just the mean, can further strengthen the features. Thus, we propose a novel and simple GLAD method, named MUSE. The key innovation of MUSE is taking multifaceted summaries of reconstruction errors as graph features for GLAD. This surprisingly simple method obtains SOTA performance in GLAD, performing best overall among 14 methods across 10 datasets.
Submitted 27 October, 2024; originally announced October 2024.
Comments: Published as a conference paper at NeurIPS 2024.
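The key idea, multifaceted summaries of reconstruction errors instead of the mean alone, is easy to sketch. The particular summary statistics below are our illustrative choices, not necessarily the ones MUSE uses.

```python
# Illustrative sketch of the idea described in the abstract: characterize a
# graph by several summaries of its reconstruction errors, not just the mean.
import numpy as np

def error_summary(errors):
    """Multifaceted summary of per-node/per-pair reconstruction errors."""
    e = np.asarray(errors, dtype=float)
    return np.array([
        e.mean(),             # the classic Graph-AE anomaly score
        e.std(),              # spread: uniform vs. concentrated errors
        e.min(), e.max(),     # extremes
        np.quantile(e, 0.9),  # tail behavior
    ])

# Two graphs with the same mean error but very different error profiles are
# indistinguishable to mean-only scoring, yet separable via richer summaries.
g1 = np.full(100, 0.5)                                  # uniformly mediocre
g2 = np.concatenate([np.zeros(90), np.full(10, 5.0)])   # a few bad spots
print(g1.mean(), g2.mean())                   # both 0.5
print(error_summary(g1) - error_summary(g2))  # clearly nonzero difference
```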

4. arXiv:2410.02646 [pdf, other] (cs.CV)
Learning 3D Perception from Others' Predictions
Authors: Jinsu Yoo, Zhenyang Feng, Tai-Yu Pan, Yihong Sun, Cheng Perng Phoo, Xiangyu Chen, Mark Campbell, Kilian Q. Weinberger, Bharath Hariharan, Wei-Lun Chao
Abstract: Accurate 3D object detection in real-world environments requires a huge amount of high-quality annotated data. Acquiring such data is tedious and expensive, and the effort often needs to be repeated when a new sensor is adopted or when the detector is deployed in a new environment. We investigate a new scenario for constructing 3D object detectors: learning from the predictions of a nearby unit that is equipped with an accurate detector. For example, when a self-driving car enters a new area, it may learn from other traffic participants whose detectors have been optimized for that area. This setting is label-efficient, sensor-agnostic, and communication-efficient: nearby units only need to share their predictions with the ego agent (e.g., the car). Naively using the received predictions as ground truths to train the ego car's detector, however, leads to inferior performance. We systematically study the problem and identify viewpoint mismatches and mislocalization (due to synchronization and GPS errors) as the main causes, which unavoidably result in false positives, false negatives, and inaccurate pseudo labels. We propose a distance-based curriculum, first learning from closer units with similar viewpoints and subsequently improving the quality of other units' predictions via self-training. We further demonstrate that an effective pseudo-label refinement module can be trained with a handful of annotated data, largely reducing the amount of data needed to train an object detector. We validate our approach on a recently released real-world collaborative driving dataset, using reference cars' predictions as pseudo labels for the ego car. Extensive experiments covering several scenarios (e.g., different sensors, detectors, and domains) demonstrate the effectiveness of our approach toward label-efficient learning of 3D perception from other units' predictions.
Submitted 4 October, 2024; v1 submitted 3 October, 2024; originally announced October 2024.
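A hedged sketch of what a distance-based curriculum could look like; the stage thresholds and data layout below are invented for illustration and are not the paper's values.

```python
# Toy sketch of a distance-based curriculum as described in the abstract:
# train on pseudo labels from nearby units first (similar viewpoints, less
# mislocalization), then gradually admit farther ones.
def curriculum_pools(pseudo_labels, stages=(10.0, 30.0, 60.0)):
    """pseudo_labels: iterable of (distance_to_ego_m, label) pairs.

    Yields one training pool per stage; each pool is a superset of the last.
    """
    for max_dist in stages:
        yield [lab for dist, lab in pseudo_labels if dist <= max_dist]

labels = [(5.2, "car_a"), (24.8, "car_b"), (57.1, "car_c"), (80.0, "car_d")]
for stage, pool in enumerate(curriculum_pools(labels)):
    print(f"stage {stage}: train on {pool}")  # car_d never enters the pools
```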

5. arXiv:2408.11008 [pdf, other] (cs.DC)
Towards a Standardized Representation for Deep Learning Collective Algorithms
Authors: Jinsun Yoo, William Won, Meghan Cowan, Nan Jiang, Benjamin Klenk, Srinivas Sridharan, Tushar Krishna
Abstract: The explosion of machine learning model sizes has led to execution on distributed clusters at very large scale. Many works have tried to optimize the process of producing collective algorithms and running collective communications, which act as a bottleneck in distributed machine learning. However, different works use their own collective algorithm representations, which hinders co-optimizing collective communication and the rest of the workload. The lack of a standardized collective algorithm representation has also hindered interoperability between collective algorithm producers and consumers. Additionally, tool-specific conversions and modifications have to be made for each pair of tools producing and consuming collective algorithms, adding to engineering effort. In this position paper, we propose a standardized workflow leveraging a common collective algorithm representation. Upstream producers and downstream consumers converge on a common representation format based on Chakra Execution Trace, a commonly used graph-based representation of distributed machine learning workloads.
Such a common representation enables us to view collective communications at the same level as workload operations, decouple producer and consumer tools, enhance interoperability, and relieve the user of the burden of focusing on downstream implementations. We provide a proof of concept of this standardized workflow by simulating collective algorithms generated by the MSCCLang domain-specific language in the ASTRA-sim distributed machine learning simulator under various network configurations.
Submitted 20 August, 2024; originally announced August 2024.
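As a rough illustration (not the actual Chakra schema), a graph-based representation of a collective algorithm boils down to communication ops with explicit dependency edges; all field names here are our own.

```python
# Toy sketch of a graph-based collective-algorithm representation: nodes are
# communication ops, edges are dependencies. The real Chakra Execution Trace
# format is richer; this only illustrates the structure the paper argues
# producers and consumers should share.
from dataclasses import dataclass, field

@dataclass
class CommOp:
    op_id: int
    kind: str                  # e.g. "send" or "recv"
    peer: int                  # rank on the other side of the transfer
    chunk: int                 # which data chunk moves
    deps: list = field(default_factory=list)  # op_ids that must finish first

# One illustrative step of a ring all-reduce, as seen from rank 0 of 3:
# send chunk 0 to the right neighbor, then receive chunk 2 from the left.
rank0 = [
    CommOp(0, "send", peer=1, chunk=0),
    CommOp(1, "recv", peer=2, chunk=2, deps=[0]),  # ordering is illustrative
]
for op in rank0:
    print(op)
```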

6. arXiv:2408.05918 [pdf, other] (cs.CV)
PAFormer: Part Aware Transformer for Person Re-identification
Authors: Hyeono Jung, Jangwon Lee, Jiwon Yoo, Dami Ko, Gyeonghwan Kim
Abstract: Within the domain of person re-identification (ReID), partial ReID methods are considered mainstream, aiming to measure feature distances through comparisons of body parts between samples. In practice, however, previous methods often lack sufficient awareness of the anatomy of body parts and thus fail to capture features of the same body parts across different samples. To address this issue, we introduce the Part Aware Transformer (PAFormer), a pose-estimation-based ReID model that can perform precise part-to-part comparison. To inject part awareness, we introduce learnable parameters called "pose tokens" that estimate the correlation between each body part and partial regions of the image. Notably, at inference time, PAFormer operates without the additional body-part localization modules commonly used in previous ReID methods that leverage pose estimation. Additionally, leveraging its enhanced awareness of body parts, PAFormer uses a learning-based visibility predictor to estimate the degree of occlusion of each body part. We also introduce a teacher-forcing technique that uses ground-truth visibility scores, enabling PAFormer to be trained only on visible parts. Extensive experiments show that our method outperforms existing approaches on well-known ReID benchmark datasets.
Submitted 12 August, 2024; originally announced August 2024.
Comments: 34 pages, 8 figures.
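The teacher-forcing idea, training only on visible parts using ground-truth visibility scores, can be sketched as a masked part-wise loss. Shapes, names, and the distance choice are illustrative, not the paper's code.

```python
# Sketch of a visibility-masked part-wise loss in the spirit of the abstract:
# with ground-truth visibility (teacher forcing), occluded parts contribute
# nothing to the training signal.
import torch

def visible_part_loss(feat_a, feat_b, visibility):
    """feat_*: (num_parts, dim) part features of two images of one identity.
    visibility: (num_parts,) ground-truth 0/1 visibility scores.
    """
    part_dist = ((feat_a - feat_b) ** 2).sum(dim=1)   # per-part distance
    mask = visibility.float()
    return (part_dist * mask).sum() / mask.sum().clamp(min=1)

feat_a, feat_b = torch.randn(6, 128), torch.randn(6, 128)
vis = torch.tensor([1, 1, 1, 0, 0, 1])   # e.g. legs occluded in one image
print(visible_part_loss(feat_a, feat_b, vis))
```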

7. arXiv:2408.00994 [pdf, other] (cs.SE, cs.AI, cs.CL)
ArchCode: Incorporating Software Requirements in Code Generation with Large Language Models
Authors: Hojae Han, Jaejin Kim, Jaeseok Yoo, Youngwon Lee, Seung-won Hwang
Abstract: This paper aims to extend the code generation capability of large language models (LLMs) to automatically manage comprehensive software requirements from given textual descriptions. Such requirements include both functional requirements (i.e., achieving expected behavior for inputs) and non-functional requirements (e.g., time/space performance, robustness, maintainability). However, textual descriptions can express requirements verbosely or may even omit some of them. We introduce ARCHCODE, a novel framework that leverages in-context learning to organize requirements observed in descriptions and to extrapolate unexpressed requirements from them. ARCHCODE generates requirements from given descriptions and conditions on them to produce code snippets and test cases. Each test case is tailored to one of the requirements, allowing code snippets to be ranked by how well their execution results comply with the requirements. Public benchmarks show that ARCHCODE improves the satisfaction of functional requirements, significantly improving Pass@k scores. Furthermore, we introduce HumanEval-NFR, the first evaluation of LLMs' handling of non-functional requirements in code generation, demonstrating ARCHCODE's superiority over baseline methods. The implementation of ARCHCODE and the HumanEval-NFR benchmark are both publicly accessible.
Submitted 1 August, 2024; originally announced August 2024.
Comments: Accepted by ACL 2024 main conference.
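The ranking step the abstract describes, scoring candidate snippets by compliance with requirement-derived test cases, reduces to something like the following. The toy runner and test format are our assumptions; the real system executes LLM-generated test code.

```python
# Sketch of test-compliance ranking in the spirit of ARCHCODE's abstract:
# each test targets one requirement; candidates are ranked by tests passed.
def rank_candidates(candidates, tests):
    """candidates: list of callables; tests: list of (input, expected)."""
    def score(fn):
        passed = 0
        for arg, expected in tests:
            try:
                if fn(arg) == expected:
                    passed += 1
            except Exception:
                pass  # a crash counts against the snippet (robustness)
        return passed
    return sorted(candidates, key=score, reverse=True)

tests = [(2, 4), (-3, 9), ("x", None)]   # last test probes robustness
snippets = [
    lambda v: v * v,                                        # crashes on "x"
    lambda v: v * v if isinstance(v, (int, float)) else None,
]
best = rank_candidates(snippets, tests)[0]
print(best(5))  # 25: the robust snippet ranks first
```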
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ACL 2024 main conference</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.19871">arXiv:2407.19871</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.19871">pdf</a>, <a href="https://arxiv.org/ps/2407.19871">ps</a>, <a href="https://arxiv.org/format/2407.19871">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> </div> </div> <p class="title is-5 mathjax"> Fast Private Location-based Information Retrieval Over the Torus </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yoo%2C+J+S">Joon Soo Yoo</a>, <a href="/search/cs?searchtype=author&amp;query=Hong%2C+M+Y">Mi Yeon Hong</a>, <a href="/search/cs?searchtype=author&amp;query=Heo%2C+J+W">Ji Won Heo</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+K+H">Kang Hoon Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Yoon%2C+J+W">Ji Won Yoon</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.19871v1-abstract-short" style="display: inline;"> Location-based services offer immense utility, but also pose significant privacy risks. In response, we propose LocPIR, a novel framework using homomorphic encryption (HE), specifically the TFHE scheme, to preserve user location privacy when retrieving data from public clouds. Our system employs TFHE&#39;s expertise in non-polynomial evaluations, crucial for comparison operations. LocPIR showcases min&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.19871v1-abstract-full').style.display = 'inline'; document.getElementById('2407.19871v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.19871v1-abstract-full" style="display: none;"> Location-based services offer immense utility, but also pose significant privacy risks. In response, we propose LocPIR, a novel framework using homomorphic encryption (HE), specifically the TFHE scheme, to preserve user location privacy when retrieving data from public clouds. Our system employs TFHE&#39;s expertise in non-polynomial evaluations, crucial for comparison operations. LocPIR showcases minimal client-server interaction, reduced memory overhead, and efficient throughput. Performance tests confirm its computational speed, making it a viable solution for practical scenarios, demonstrated via application to a COVID-19 alert model. Thus, LocPIR effectively addresses privacy concerns in location-based services, enabling secure data sharing from the public cloud. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.19871v1-abstract-full').style.display = 'none'; document.getElementById('2407.19871v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at the IEEE International Conference on Advanced Video and Signal-Based Surveillance (AVSS) 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.10296">arXiv:2406.10296</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.10296">pdf</a>, <a href="https://arxiv.org/format/2406.10296">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> </div> </div> <p class="title is-5 mathjax"> CLST: Cold-Start Mitigation in Knowledge Tracing by Aligning a Generative Language Model as a Students&#39; Knowledge Tracer </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Jung%2C+H">Heeseok Jung</a>, <a href="/search/cs?searchtype=author&amp;query=Yoo%2C+J">Jaesang Yoo</a>, <a href="/search/cs?searchtype=author&amp;query=Yoon%2C+Y">Yohaan Yoon</a>, <a href="/search/cs?searchtype=author&amp;query=Jang%2C+Y">Yeonju Jang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.10296v2-abstract-short" style="display: inline;"> Knowledge tracing (KT), wherein students&#39; problem-solving histories are used to estimate their current levels of knowledge, has attracted significant interest from researchers. However, most existing KT models were developed with an ID-based paradigm, which exhibits limitations in cold-start performance. These limitations can be mitigated by leveraging the vast quantities of external knowledge pos&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.10296v2-abstract-full').style.display = 'inline'; document.getElementById('2406.10296v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.10296v2-abstract-full" style="display: none;"> Knowledge tracing (KT), wherein students&#39; problem-solving histories are used to estimate their current levels of knowledge, has attracted significant interest from researchers. However, most existing KT models were developed with an ID-based paradigm, which exhibits limitations in cold-start performance. These limitations can be mitigated by leveraging the vast quantities of external knowledge possessed by generative large language models (LLMs). 

10. arXiv:2406.09716 [pdf, ps, other] (cs.CR, cs.AI, cs.DC, cs.LG)
Speed-up of Data Analysis with Kernel Trick in Encrypted Domain
Authors: Joon Soo Yoo, Baek Kyung Song, Tae Min Ahn, Ji Won Heo, Ji Won Yoon
Abstract: Homomorphic encryption (HE) is pivotal for secure computation on encrypted data and crucial in privacy-preserving data analysis. However, efficiently processing high-dimensional data in HE, especially for machine learning and statistical (ML/STAT) algorithms, poses a challenge. In this paper, we present an effective acceleration method that uses the kernel method for HE schemes, improving the time performance of ML/STAT algorithms in encrypted domains. This technique, independent of the underlying HE mechanism and complementary to existing optimizations, notably reduces costly HE multiplications, offering near-constant time complexity relative to the data dimension. Aimed at accessibility, the method is tailored for data scientists and developers with limited cryptography background, facilitating advanced data analysis in secure environments.
Submitted 14 June, 2024; originally announced June 2024.
Comments: Submitted as a preprint.
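The reason a kernel (Gram-matrix) formulation helps under HE can be seen in plain numpy: once the n x n Gram matrix is formed, downstream computation never touches the d-dimensional vectors again, so the count of costly multiplications stops scaling with d. This stand-in uses kernel ridge regression as the downstream task; the paper's actual algorithm suite and HE details may differ.

```python
# Plaintext sketch of the kernel trick's benefit for encrypted computation:
# after one pass to build the Gram matrix, all further work is O(n^2),
# independent of the (possibly huge) data dimension d.
import numpy as np

n, d = 8, 10_000             # few samples, very high dimension
X = np.random.randn(n, d)

K = X @ X.T                  # Gram matrix of inner products, built once
y = np.random.randn(n)

# Kernelized ridge regression: solve in the n-dimensional dual, then
# predict via K. No step below multiplies d-dimensional vectors again.
alpha = np.linalg.solve(K + 1e-1 * np.eye(n), y)   # dual coefficients
y_hat = K @ alpha
print(y_hat.shape)           # (8,): everything stayed n-dimensional
```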

11. arXiv:2406.04814 [pdf, other] (cs.CV, cs.LG)
Online Continual Learning of Video Diffusion Models From a Single Video Stream
Authors: Jason Yoo, Dylan Green, Geoff Pleiss, Frank Wood
Abstract: Diffusion models have shown exceptional capabilities in generating realistic videos. Yet, their training has been predominantly confined to offline environments where models can repeatedly train on i.i.d. data to convergence. This work explores the feasibility of training diffusion models from a semantically continuous video stream, where correlated video frames arrive sequentially, one at a time. To investigate this, we introduce two novel continual video generative modeling benchmarks, Lifelong Bouncing Balls and Windows 95 Maze Screensaver, each containing over a million video frames generated by navigating stationary environments. Surprisingly, our experiments show that diffusion models can be effectively trained online using experience replay, achieving performance comparable to models trained with i.i.d. samples given the same number of gradient steps.
Submitted 7 June, 2024; originally announced June 2024.
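The experience-replay recipe reported in the abstract is simple to sketch: buffer incoming frames and train each step on a random sample, which breaks the temporal correlation of the stream. Buffer size, batch size, and the training step below are placeholders, not the paper's settings.

```python
# Sketch of online training with experience replay from a single stream.
# `train_step` stands in for one diffusion-model (denoising-loss) update.
import random
from collections import deque

buffer = deque(maxlen=10_000)        # bounded replay memory

def train_step(batch):               # placeholder for a gradient update
    pass

def stream():                        # stands in for the video stream
    for t in range(1_000):
        yield f"frame_{t}"

for frame in stream():
    buffer.append(frame)             # new frames always enter the buffer
    if len(buffer) >= 64:
        batch = random.sample(list(buffer), 64)   # replay breaks temporal
        train_step(batch)                         # correlation across steps
```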

12. arXiv:2406.01045 [pdf, other] (cs.CL, cs.AI)
Decompose, Enrich, and Extract! Schema-aware Event Extraction using LLMs
Authors: Fatemeh Shiri, Van Nguyen, Farhad Moghimifar, John Yoo, Gholamreza Haffari, Yuan-Fang Li
Abstract: Large Language Models (LLMs) demonstrate significant capabilities in processing natural language data, promising efficient knowledge extraction from diverse textual sources to enhance situational awareness and support decision-making. However, concerns arise due to their susceptibility to hallucination, which can result in contextually inaccurate content. This work focuses on harnessing LLMs for automated Event Extraction, introducing a new method that addresses hallucination by decomposing the task into Event Detection and Event Argument Extraction. Moreover, the proposed method integrates dynamic, schema-aware, retrieval-augmented examples into prompts tailored to each specific inquiry, thereby extending and adapting advanced prompting techniques such as Retrieval-Augmented Generation. Evaluation findings on prominent event extraction benchmarks, together with results on a synthesized benchmark, illustrate the method's superior performance compared to baseline approaches.
Submitted 3 June, 2024; originally announced June 2024.
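The decomposition the abstract describes (Event Detection first, then schema-aware Event Argument Extraction with retrieved examples in the prompt) can be sketched as a two-stage pipeline. The prompts, the `llm` callable, and the toy retriever are all our stand-ins, not the paper's.

```python
# Two-stage event extraction sketch: detect event types, then extract
# arguments per type with the schema and retrieved examples in the prompt.
def detect_events(llm, text):
    return llm(f"List the event types mentioned in:\n{text}")

def extract_arguments(llm, text, event_type, schema, retrieved_examples):
    prompt = (f"Schema for {event_type}: {schema}\n"
              f"Examples:\n{retrieved_examples}\n"
              f"Extract the arguments of the {event_type} event in:\n{text}")
    return llm(prompt)

def pipeline(llm, text, schemas, retriever):
    results = {}
    for ev in detect_events(llm, text):
        results[ev] = extract_arguments(llm, text, ev, schemas[ev],
                                        retriever(ev, text))
    return results

# Toy LLM stand-in so the pipeline runs end to end.
toy = lambda p: ["Attack"] if "event types" in p else {"attacker": "X"}
schemas = {"Attack": ["attacker", "target", "place"]}
print(pipeline(toy, "X struck Y in Z.", schemas, lambda ev, t: "(none)"))
```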
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.01045">arXiv:2406.01045</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.01045">pdf</a>, <a href="https://arxiv.org/format/2406.01045">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Decompose, Enrich, and Extract! Schema-aware Event Extraction using LLMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Shiri%2C+F">Fatemeh Shiri</a>, <a href="/search/cs?searchtype=author&amp;query=Nguyen%2C+V">Van Nguyen</a>, <a href="/search/cs?searchtype=author&amp;query=Moghimifar%2C+F">Farhad Moghimifar</a>, <a href="/search/cs?searchtype=author&amp;query=Yoo%2C+J">John Yoo</a>, <a href="/search/cs?searchtype=author&amp;query=Haffari%2C+G">Gholamreza Haffari</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yuan-Fang Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Large Language Models (LLMs) demonstrate significant capabilities in processing natural language data, promising efficient knowledge extraction from diverse textual sources to enhance situational awareness and support decision-making. However, concerns arise due to their susceptibility to hallucination, resulting in contextually inaccurate content. This work focuses on harnessing LLMs for automated Event Extraction, introducing a new method to address hallucination by decomposing the task into Event Detection and Event Argument Extraction. Moreover, the proposed method integrates dynamic schema-aware augmented retrieval examples into prompts tailored for each specific inquiry, thereby extending and adapting advanced prompting techniques such as Retrieval-Augmented Generation. Evaluation findings on prominent event extraction benchmarks and results from a synthesized benchmark illustrate the method&#39;s superior performance compared to baseline approaches. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li>
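<p>The decomposition is straightforward to sketch. Below, <code>llm</code> and <code>retrieve_examples</code> are hypothetical stand-ins for a model client and a schema-aware retriever; the prompts are illustrative, not the paper's.</p>
<pre><code class="language-python">
# Two-stage prompting: detect events first, then extract arguments with
# schema-aware retrieved examples injected into the second prompt.
def detect_events(llm, text):
    prompt = f"List the event triggers and event types in:\n{text}"
    return llm(prompt)

def extract_arguments(llm, retrieve_examples, text, event, schema):
    examples = retrieve_examples(event_type=event["type"], schema=schema)
    prompt = (
        f"Schema for {event['type']}: {schema[event['type']]}\n"
        f"Examples:\n{examples}\n"
        f"Extract the arguments of '{event['trigger']}' in:\n{text}"
    )
    return llm(prompt)
</code></pre>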
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.11614">arXiv:2405.11614</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2405.11614">pdf</a>, <a href="https://arxiv.org/format/2405.11614">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Nickel and Diming Your GAN: A Dual-Method Approach to Enhancing GAN Efficiency via Knowledge Distillation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yeo%2C+S">Sangyeop Yeo</a>, <a href="/search/cs?searchtype=author&amp;query=Jang%2C+Y">Yoojin Jang</a>, <a href="/search/cs?searchtype=author&amp;query=Yoo%2C+J">Jaejun Yoo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> In this paper, we address the challenge of compressing generative adversarial networks (GANs) for deployment in resource-constrained environments by proposing two novel methodologies: Distribution Matching for Efficient compression (DiME) and Network Interactive Compression via Knowledge Exchange and Learning (NICKEL). DiME employs foundation models as embedding kernels for efficient distribution matching, leveraging maximum mean discrepancy to facilitate effective knowledge distillation. Simultaneously, NICKEL employs an interactive compression method that enhances the communication between the student generator and discriminator, achieving a balanced and stable compression process. Our comprehensive evaluation on the StyleGAN2 architecture with the FFHQ dataset shows the effectiveness of our approach, with NICKEL &amp; DiME achieving FID scores of 10.45 and 15.93 at compression rates of 95.73% and 98.92%, respectively. Remarkably, our methods sustain generative quality even at an extreme compression rate of 99.69%, surpassing the previous state-of-the-art performance by a large margin. These findings not only demonstrate our methodologies&#39; capacity to significantly lower GANs&#39; computational demands but also pave the way for deploying high-quality GAN models in settings with limited resources. Our code will be released soon. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> </li>
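<p>DiME's distribution-matching term is maximum mean discrepancy. A standard unbiased MMD^2 estimator with an RBF kernel (NumPy), applied to teacher and student embedding batches; the embedding extraction itself is omitted here:</p>
<pre><code class="language-python">
# Unbiased MMD^2 between two samples x (m, d) and y (n, d).
import numpy as np

def rbf(a, b, sigma=1.0):
    d2 = ((a[:, None, :] - b[None, :, :]) ** 2).sum(-1)   # pairwise sq. dists
    return np.exp(-d2 / (2 * sigma ** 2))

def mmd2(x, y, sigma=1.0):
    m, n = len(x), len(y)
    kxx, kyy, kxy = rbf(x, x, sigma), rbf(y, y, sigma), rbf(x, y, sigma)
    return ((kxx.sum() - np.trace(kxx)) / (m * (m - 1))    # drop diagonal terms
            + (kyy.sum() - np.trace(kyy)) / (n * (n - 1))
            - 2 * kxy.mean())
</code></pre>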
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.00646">arXiv:2405.00646</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2405.00646">pdf</a>, <a href="https://arxiv.org/format/2405.00646">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Learning to Compose: Improving Object Centric Learning by Injecting Compositionality </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Jung%2C+W">Whie Jung</a>, <a href="/search/cs?searchtype=author&amp;query=Yoo%2C+J">Jaehoon Yoo</a>, <a href="/search/cs?searchtype=author&amp;query=Ahn%2C+S">Sungjin Ahn</a>, <a href="/search/cs?searchtype=author&amp;query=Hong%2C+S">Seunghoon Hong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Learning compositional representation is a key aspect of object-centric learning as it enables flexible systematic generalization and supports complex visual reasoning. However, most of the existing approaches rely on the auto-encoding objective, while the compositionality is implicitly imposed by the architectural or algorithmic bias in the encoder. This misalignment between the auto-encoding objective and learning compositionality often results in failure of capturing meaningful object representations. In this study, we propose a novel objective that explicitly encourages compositionality of the representations. Built upon the existing object-centric learning framework (e.g., slot attention), our method incorporates additional constraints that an arbitrary mixture of object representations from two images should be valid by maximizing the likelihood of the composite data. We demonstrate that incorporating our objective into the existing framework consistently improves object-centric learning and enhances the robustness to the architectural choices. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> </li>
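<p>A sketch of the compositional objective, assuming hypothetical slot-attention-style <code>encoder</code> and <code>decoder</code> modules and a <code>prior_log_prob</code> scorer standing in for however the likelihood of the composite data is evaluated:</p>
<pre><code class="language-python">
# Swap a random subset of slots between two images and reward composites
# that the likelihood model considers valid.
import torch

def composition_loss(encoder, decoder, prior_log_prob, img_a, img_b):
    slots_a, slots_b = encoder(img_a), encoder(img_b)     # (B, K, D) each
    mask = (torch.rand(slots_a.shape[:2]) > 0.5).unsqueeze(-1)
    mixed = torch.where(mask, slots_a, slots_b)           # arbitrary slot mixture
    composite = decoder(mixed)
    return -prior_log_prob(composite).mean()              # encourage valid composites
</code></pre>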
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.00646v1-abstract-full').style.display = 'none'; document.getElementById('2405.00646v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.18066">arXiv:2404.18066</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2404.18066">pdf</a>, <a href="https://arxiv.org/format/2404.18066">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neurons and Cognition">q-bio.NC</span> </div> </div> <p class="title is-5 mathjax"> Quantized Context Based LIF Neurons for Recurrent Spiking Neural Networks in 45nm </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Bezugam%2C+S+S">Sai Sukruth Bezugam</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Y">Yihao Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Yoo%2C+J">JaeBum Yoo</a>, <a href="/search/cs?searchtype=author&amp;query=Strukov%2C+D">Dmitri Strukov</a>, <a href="/search/cs?searchtype=author&amp;query=Kim%2C+B">Bongjin Kim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.18066v1-abstract-short" style="display: inline;"> In this study, we propose the first hardware implementation of a context-based recurrent spiking neural network (RSNN) emphasizing on integrating dual information streams within the neocortical pyramidal neurons specifically Context- Dependent Leaky Integrate and Fire (CLIF) neuron models, essential element in RSNN. We present a quantized version of the CLIF neuron (qCLIF), developed through a har&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.18066v1-abstract-full').style.display = 'inline'; document.getElementById('2404.18066v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.18066v1-abstract-full" style="display: none;"> In this study, we propose the first hardware implementation of a context-based recurrent spiking neural network (RSNN) emphasizing on integrating dual information streams within the neocortical pyramidal neurons specifically Context- Dependent Leaky Integrate and Fire (CLIF) neuron models, essential element in RSNN. We present a quantized version of the CLIF neuron (qCLIF), developed through a hardware-software codesign approach utilizing the sparse activity of RSNN. 
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.15333">arXiv:2404.15333</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2404.15333">pdf</a>, <a href="https://arxiv.org/format/2404.15333">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> EB-GAME: A Game-Changer in ECG Heartbeat Anomaly Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Park%2C+J">JuneYoung Park</a>, <a href="/search/cs?searchtype=author&amp;query=Kim%2C+D+Y">Da Young Kim</a>, <a href="/search/cs?searchtype=author&amp;query=Kim%2C+Y">Yunsoo Kim</a>, <a href="/search/cs?searchtype=author&amp;query=Yoo%2C+J">Jisu Yoo</a>, <a href="/search/cs?searchtype=author&amp;query=Kim%2C+T+J">Tae Joon Kim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Cardiologists use electrocardiograms (ECG) for the detection of arrhythmias. However, continuous monitoring of ECG signals to detect cardiac abnormalities requires significant time and human resources. As a result, several deep learning studies have been conducted in advance for the automatic detection of arrhythmia. These models show relatively high performance in supervised learning, but are not applicable in cases with few training examples. This is because abnormal ECG data is scarce compared to normal data in most real-world clinical settings. Therefore, in this study, GAN-based anomaly detection, i.e., unsupervised learning, was employed to address the issue of data imbalance. This paper focuses on detecting abnormal signals in electrocardiograms (ECGs) using only labels from normal signals as training data. Inspired by self-supervised vision transformers, which learn by dividing images into patches, and masked auto-encoders, known for their effectiveness in patch reconstruction and solving information redundancy, we introduce the ECG Heartbeat Anomaly Detection model, EB-GAME. EB-GAME was trained and validated on the MIT-BIH Arrhythmia Dataset, where it achieved state-of-the-art performance on this benchmark. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> </li>
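<p>One way to read the masked-reconstruction idea behind the model: train a reconstructor on normal beats only, then score a beat by its masked-patch reconstruction error. A hedged sketch, with <code>reconstruct</code> a hypothetical model trained on normal ECG and thresholding left to the user:</p>
<pre><code class="language-python">
# Mask one patch of the 1-D beat at a time; beats the reconstructor cannot
# repair well (high error) are flagged as anomalous.
import numpy as np

def anomaly_score(reconstruct, beat, patch=16):
    scores = []
    for start in range(0, len(beat) - patch, patch):
        masked = beat.copy()
        masked[start:start + patch] = 0.0            # mask one patch
        recon = reconstruct(masked)
        err = np.mean((recon[start:start + patch] - beat[start:start + patch]) ** 2)
        scores.append(err)
    return float(np.mean(scores))                    # high score = anomalous beat
</code></pre>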
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.09161">arXiv:2404.09161</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2404.09161">pdf</a>, <a href="https://arxiv.org/format/2404.09161">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Coreset Selection for Object Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lee%2C+H">Hojun Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Kim%2C+S">Suyoung Kim</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+J">Junhoo Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Yoo%2C+J">Jaeyoung Yoo</a>, <a href="/search/cs?searchtype=author&amp;query=Kwak%2C+N">Nojun Kwak</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Coreset selection is a method for selecting a small, representative subset of an entire dataset. It has been primarily researched in image classification, assuming there is only one object per image. However, coreset selection for object detection is more challenging as an image can contain multiple objects. As a result, much research has yet to be done on this topic. Therefore, we introduce a new approach, Coreset Selection for Object Detection (CSOD). CSOD generates imagewise and classwise representative feature vectors for multiple objects of the same class within each image. Subsequently, we adopt submodular optimization for considering both representativeness and diversity and utilize the representative vectors in the submodular optimization process to select a subset. When we evaluated CSOD on the Pascal VOC dataset, CSOD outperformed random selection by +6.4%p in AP$_{50}$ when selecting 200 images. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by CVPR 2024: 1st Workshop on Dataset Distillation for Computer Vision</span> </p> </li>
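<p>Greedy facility-location selection is the textbook way to trade representativeness against diversity with a submodular objective; CSOD plugs its image- and class-wise detection features into this kind of loop (details here are illustrative, not the paper's exact objective).</p>
<pre><code class="language-python">
# Facility location: f(S) = sum_i max_{j in S} sim[i, j]; greedily add the
# candidate with the largest marginal coverage gain.
import numpy as np

def greedy_coreset(feats, k):
    feats = feats / np.linalg.norm(feats, axis=1, keepdims=True)
    sim = feats @ feats.T                        # pairwise similarities
    best = np.zeros(len(feats))                  # current coverage per point
    chosen = []
    for _ in range(k):
        gains = np.maximum(sim - best[None, :], 0).sum(axis=1)
        gains[chosen] = -1.0                     # never re-pick an element
        c = int(np.argmax(gains))
        chosen.append(c)
        best = np.maximum(best, sim[c])          # update coverage
    return chosen
</code></pre>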
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.03155">arXiv:2404.03155</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2404.03155">pdf</a>, <a href="https://arxiv.org/format/2404.03155">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Emerging Technologies">cs.ET</span> </div> </div> <p class="title is-5 mathjax"> TEGRA -- Scaling Up Terascale Graph Processing with Disaggregated Computing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Shaddix%2C+W">William Shaddix</a>, <a href="/search/cs?searchtype=author&amp;query=Samani%2C+M">Mahyar Samani</a>, <a href="/search/cs?searchtype=author&amp;query=Fariborz%2C+M">Marjan Fariborz</a>, <a href="/search/cs?searchtype=author&amp;query=Yoo%2C+S+J+B">S. J. Ben Yoo</a>, <a href="/search/cs?searchtype=author&amp;query=Lowe-Power%2C+J">Jason Lowe-Power</a>, <a href="/search/cs?searchtype=author&amp;query=Akella%2C+V">Venkatesh Akella</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Graphs are essential for representing relationships in various domains, driving modern AI applications such as graph analytics and neural networks across science, engineering, cybersecurity, transportation, and economics. However, the size of modern graphs is rapidly expanding, posing challenges for traditional CPUs and GPUs in meeting real-time processing demands. As a result, hardware accelerators for graph processing have been proposed. However, the largest graphs these systems can handle remain modest, often targeting the Twitter graph (approximately 1.4B edges). This paper aims to address this limitation by developing a graph accelerator capable of terascale graph processing. Scale-out architectures, in which nodes are replicated to accommodate larger datasets, are a natural fit for handling larger graphs. We argue that this approach is not appropriate for very large-scale graphs because it leads to underutilization of both memory and compute resources. Additionally, vertex and edge processing have different access patterns. Communication overheads also pose further challenges in designing scalable architectures. To overcome these issues, this paper proposes TEGRA, a scale-up architecture for terascale graph processing. TEGRA leverages a composable computing system with disaggregated resources and a communication architecture inspired by Active Messages. By employing direct communication between cores and optimizing memory interconnect utilization, TEGRA effectively reduces communication overhead and improves resource utilization, therefore enabling efficient processing of terascale graphs. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Presented at the 3rd Workshop on Heterogeneous Composable and Disaggregated Systems (HCDS 2024)</span> </p> </li>
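<p>A toy model of the Active Messages idea TEGRA builds on: the message carries the handler to run at the destination, so vertex updates execute where the data lives instead of bouncing through shared memory. Entirely illustrative:</p>
<pre><code class="language-python">
# Each core drains an inbox of (handler, args) messages and runs the
# handler against its local vertex state.
from queue import SimpleQueue

class Core:
    def __init__(self):
        self.inbox = SimpleQueue()
        self.vertex_state = {}

    def send(self, dst, handler, *args):
        dst.inbox.put((handler, args))           # direct core-to-core message

    def drain(self):
        while not self.inbox.empty():
            handler, args = self.inbox.get()
            handler(self, *args)                 # run the handler at the data

def relax_edge(core, v, dist):                   # e.g. an SSSP-style update
    if core.vertex_state.get(v, float("inf")) > dist:
        core.vertex_state[v] = dist
</code></pre>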
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Presented at the 3rd Workshop on Heterogeneous Composable and Disaggregated Systems (HCDS 2024)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.02865">arXiv:2404.02865</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2404.02865">pdf</a>, <a href="https://arxiv.org/format/2404.02865">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> End-To-End Self-tuning Self-supervised Time Series Anomaly Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Deforce%2C+B">Boje Deforce</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+M">Meng-Chieh Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Baesens%2C+B">Bart Baesens</a>, <a href="/search/cs?searchtype=author&amp;query=Asensio%2C+E+S">Estefan铆a Serral Asensio</a>, <a href="/search/cs?searchtype=author&amp;query=Yoo%2C+J">Jaemin Yoo</a>, <a href="/search/cs?searchtype=author&amp;query=Akoglu%2C+L">Leman Akoglu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.02865v1-abstract-short" style="display: inline;"> Time series anomaly detection (TSAD) finds many applications such as monitoring environmental sensors, industry KPIs, patient biomarkers, etc. A two-fold challenge for TSAD is a versatile and unsupervised model that can detect various different types of time series anomalies (spikes, discontinuities, trend shifts, etc.) without any labeled data. Modern neural networks have outstanding ability in m&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.02865v1-abstract-full').style.display = 'inline'; document.getElementById('2404.02865v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.02865v1-abstract-full" style="display: none;"> Time series anomaly detection (TSAD) finds many applications such as monitoring environmental sensors, industry KPIs, patient biomarkers, etc. A two-fold challenge for TSAD is a versatile and unsupervised model that can detect various different types of time series anomalies (spikes, discontinuities, trend shifts, etc.) without any labeled data. Modern neural networks have outstanding ability in modeling complex time series. Self-supervised models in particular tackle unsupervised TSAD by transforming the input via various augmentations to create pseudo anomalies for training. However, their performance is sensitive to the choice of augmentation, which is hard to choose in practice, while there exists no effort in the literature on data augmentation tuning for TSAD without labels. Our work aims to fill this gap. We introduce TSAP for TSA &#34;on autoPilot&#34;, which can (self-)tune augmentation hyperparameters end-to-end. It stands on two key components: a differentiable augmentation architecture and an unsupervised validation loss to effectively assess the alignment between augmentation type and anomaly type. 
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.01690">arXiv:2404.01690</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2404.01690">pdf</a>, <a href="https://arxiv.org/format/2404.01690">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> RefQSR: Reference-based Quantization for Image Super-Resolution Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lee%2C+H">Hongjae Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Yoo%2C+J">Jun-Sang Yoo</a>, <a href="/search/cs?searchtype=author&amp;query=Jung%2C+S">Seung-Won Jung</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Single image super-resolution (SISR) aims to reconstruct a high-resolution image from its low-resolution observation. Recent deep learning-based SISR models show high performance at the expense of increased computational costs, limiting their use in resource-constrained environments. As a promising solution for computationally efficient network design, network quantization has been extensively studied. However, existing quantization methods developed for SISR have yet to effectively exploit image self-similarity, which is a new direction for exploration in this study. We introduce a novel method called reference-based quantization for image super-resolution (RefQSR) that applies high-bit quantization to several representative patches and uses them as references for low-bit quantization of the rest of the patches in an image. To this end, we design dedicated patch clustering and reference-based quantization modules and integrate them into existing SISR network quantization methods. The experimental results demonstrate the effectiveness of RefQSR on various SISR networks and quantization methods. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by IEEE Transactions on Image Processing (TIP)</span> </p> </li>
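<p>One plausible reading of reference-based quantization, with uniform quantizers standing in for the learned modules: cluster the patches, keep one high-bit reference per cluster, and encode the remaining patches at low bit relative to their reference.</p>
<pre><code class="language-python">
# Illustrative only: high-bit references plus low-bit residuals per cluster.
import numpy as np

def quantize(x, bits):
    levels = 2 ** bits - 1
    lo, hi = x.min(), x.max()
    q = np.round((x - lo) / (hi - lo + 1e-8) * levels)
    return q / levels * (hi - lo) + lo

def refqsr_like(patches, labels, ref_bits=8, low_bits=2):
    out = np.empty_like(patches)
    for c in np.unique(labels):
        idx = np.where(labels == c)[0]
        ref = quantize(patches[idx[0]], ref_bits)            # high-bit reference
        for i in idx:
            out[i] = ref + quantize(patches[i] - ref, low_bits)  # low-bit residual
    return out
</code></pre>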
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.00995">arXiv:2404.00995</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2404.00995">pdf</a>, <a href="https://arxiv.org/format/2404.00995">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> PosterLlama: Bridging Design Ability of Language Model to Contents-Aware Layout Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Seol%2C+J">Jaejung Seol</a>, <a href="/search/cs?searchtype=author&amp;query=Kim%2C+S">Seojun Kim</a>, <a href="/search/cs?searchtype=author&amp;query=Yoo%2C+J">Jaejun Yoo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Visual layout plays a critical role in graphic design fields such as advertising, posters, and web UI design. The recent trend towards content-aware layout generation through generative models has shown promise, yet it often overlooks the semantic intricacies of layout design by treating it as a simple numerical optimization. To bridge this gap, we introduce PosterLlama, a network designed for generating visually and textually coherent layouts by reformatting layout elements into HTML code and leveraging the rich design knowledge embedded within language models. Furthermore, we enhance the robustness of our model with a unique depth-based poster augmentation strategy. This ensures our generated layouts remain not only semantically rich but also visually appealing, even with limited data. Our extensive evaluations across several benchmarks demonstrate that PosterLlama outperforms existing methods in producing authentic and content-aware layouts. It supports an unparalleled range of conditions, including unconditional layout generation, element-conditional layout generation, and layout completion, serving as a highly versatile user manipulation tool. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ECCV 2024</span> </p> </li>
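<p>The reformatting trick is simple to illustrate: each layout element becomes an HTML-like tag a language model can read and emit. A hypothetical serializer for <code>(category, x, y, w, h)</code> tuples (the exact tag vocabulary is an assumption, not the paper's):</p>
<pre><code class="language-python">
# Serialize a layout into HTML-like tokens for a language model.
def layout_to_html(elements, canvas=(102, 150)):
    rows = [f'&lt;div class="canvas" width="{canvas[0]}" height="{canvas[1]}">']
    for cat, x, y, w, h in elements:
        rows.append(f'  &lt;div class="{cat}" x="{x}" y="{y}" width="{w}" height="{h}">&lt;/div>')
    rows.append("&lt;/div>")
    return "\n".join(rows)

print(layout_to_html([("text", 10, 12, 80, 20), ("logo", 35, 120, 30, 18)]))
</code></pre>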
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.00921">arXiv:2404.00921</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2404.00921">pdf</a>, <a href="https://arxiv.org/format/2404.00921">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Towards Label-Efficient Human Matting: A Simple Baseline for Weakly Semi-Supervised Trimap-Free Human Matting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Kim%2C+B">Beomyoung Kim</a>, <a href="/search/cs?searchtype=author&amp;query=Yi%2C+M+Y">Myeong Yeon Yi</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+J">Joonsang Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Yoo%2C+Y+J">Young Joon Yoo</a>, <a href="/search/cs?searchtype=author&amp;query=Hwang%2C+S+J">Sung Ju Hwang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> This paper presents a new practical training method for human matting, which demands delicate pixel-level human region identification and significantly laborious annotations. To reduce the annotation cost, most existing matting approaches often rely on image synthesis to augment the dataset. However, the unnaturalness of synthesized training images brings in a new domain generalization challenge for natural images. To address this challenge, we introduce a new learning paradigm, weakly semi-supervised human matting (WSSHM), which leverages a small amount of expensive matte labels and a large amount of budget-friendly segmentation labels, to save the annotation cost and resolve the domain generalization problem. To achieve the goal of WSSHM, we propose a simple and effective training method, named Matte Label Blending (MLB), that selectively guides only the beneficial knowledge of the segmentation and matte data to the matting model. Extensive experiments with our detailed analysis demonstrate our method can substantially improve the robustness of the matting model using a few matte data and numerous segmentation data. Our training method is also easily applicable to real-time models, achieving competitive accuracy with breakneck inference speed (328 FPS on NVIDIA V100 GPU). The implementation code is available at \url{https://github.com/clovaai/WSSHM}. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Preprint, 15 pages, 13 figures</span> </p> </li>
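<p>A hedged sketch of blending the two supervision sources: pixels with matte labels supervise the fine alpha directly, while the many segmentation-only images supervise coarse foreground probability. The weighting rule is illustrative, not the paper's exact MLB formulation.</p>
<pre><code class="language-python">
# Combine a few precise matte labels with many cheap segmentation labels.
import torch
import torch.nn.functional as F

def wsshm_loss(pred_alpha, matte=None, seg_mask=None, seg_weight=0.5):
    loss = pred_alpha.new_zeros(())
    if matte is not None:                        # few, expensive matte labels
        loss = loss + (pred_alpha - matte).abs().mean()
    if seg_mask is not None:                     # many, cheap segmentation labels
        loss = loss + seg_weight * F.binary_cross_entropy(
            pred_alpha.clamp(1e-6, 1 - 1e-6), seg_mask)
    return loss
</code></pre>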
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.00638">arXiv:2404.00638</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2404.00638">pdf</a>, <a href="https://arxiv.org/format/2404.00638">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> HypeBoy: Generative Self-Supervised Representation Learning on Hypergraphs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Kim%2C+S">Sunwoo Kim</a>, <a href="/search/cs?searchtype=author&amp;query=Kang%2C+S">Shinhwan Kang</a>, <a href="/search/cs?searchtype=author&amp;query=Bu%2C+F">Fanchen Bu</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+S+Y">Soo Yong Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Yoo%2C+J">Jaemin Yoo</a>, <a href="/search/cs?searchtype=author&amp;query=Shin%2C+K">Kijung Shin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Hypergraphs are marked by complex topology, expressing higher-order interactions among multiple nodes with hyperedges, and better capturing the topology is essential for effective representation learning. Recent advances in generative self-supervised learning (SSL) suggest that hypergraph neural networks learned from generative self-supervision have the potential to effectively encode the complex hypergraph topology. Designing a generative SSL strategy for hypergraphs, however, is not straightforward. Questions remain with regard to its generative SSL task, connection to downstream tasks, and empirical properties of learned representations. In light of the promises and challenges, we propose a novel generative SSL strategy for hypergraphs. We first formulate a generative SSL task on hypergraphs, hyperedge filling, and highlight its theoretical connection to node classification. Based on the generative SSL task, we propose a hypergraph SSL method, HypeBoy. HypeBoy learns effective general-purpose hypergraph representations, outperforming 16 baseline methods across 11 benchmark datasets. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published as a conference paper at ICLR 2024</span> </p> </li>
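<p>The hyperedge-filling pretext task in miniature: hide one node of a hyperedge and train the model to identify it among all nodes. A dot-product scorer over node embeddings sketches the idea; HypeBoy's actual architecture is more involved.</p>
<pre><code class="language-python">
# Aggregate the remaining members of a hyperedge, score every node against
# that context, and penalize with cross-entropy on the held-out node.
import torch
import torch.nn.functional as F

def hyperedge_filling_loss(node_emb, hyperedge, held_out):
    context = [v for v in hyperedge if v != held_out]
    query = node_emb[context].mean(dim=0)             # aggregate the rest
    logits = node_emb @ query                         # score every node
    return F.cross_entropy(logits.unsqueeze(0),
                           torch.tensor([held_out]))
</code></pre>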
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.19724">arXiv:2403.19724</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.19724">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Emerging Technologies">cs.ET</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Optics">physics.optics</span> </div> </div> <p class="title is-5 mathjax"> Towards Reverse-Engineering the Brain: Brain-Derived Neuromorphic Computing Approach with Photonic, Electronic, and Ionic Dynamicity in 3D integrated circuits </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yoo%2C+S+J+B">S. J. Ben Yoo</a>, <a href="/search/cs?searchtype=author&amp;query=El-Srouji%2C+L">Luis El-Srouji</a>, <a href="/search/cs?searchtype=author&amp;query=Datta%2C+S">Suman Datta</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+S">Shimeng Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Incorvia%2C+J+A">Jean Anne Incorvia</a>, <a href="/search/cs?searchtype=author&amp;query=Salleo%2C+A">Alberto Salleo</a>, <a href="/search/cs?searchtype=author&amp;query=Sorger%2C+V">Volker Sorger</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+J">Juejun Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Kimerling%2C+L+C">Lionel C Kimerling</a>, <a href="/search/cs?searchtype=author&amp;query=Bouchard%2C+K">Kristofer Bouchard</a>, <a href="/search/cs?searchtype=author&amp;query=Geng%2C+J">Joy Geng</a>, <a href="/search/cs?searchtype=author&amp;query=Chaudhuri%2C+R">Rishidev Chaudhuri</a>, <a href="/search/cs?searchtype=author&amp;query=Ranganath%2C+C">Charan Ranganath</a>, <a href="/search/cs?searchtype=author&amp;query=O%27Reilly%2C+R">Randall O&#39;Reilly</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> The human brain has immense learning capabilities at extreme energy efficiencies and scale that no artificial system has been able to match. For decades, reverse engineering the brain has been one of the top priorities of science and technology research. Despite numerous efforts, conventional electronics-based methods have failed to match the scalability, energy efficiency, and self-supervised learning capabilities of the human brain. On the other hand, very recent progress in the development of new generations of photonic and electronic memristive materials, device technologies, and 3D electronic-photonic integrated circuits (3D EPIC) promise to realize new brain-derived neuromorphic systems with comparable connectivity, density, energy-efficiency, and scalability. When combined with bio-realistic learning algorithms and architectures, it may be possible to realize an &#39;artificial brain&#39; prototype with general self-learning capabilities. This paper argues for the possibility of reverse-engineering the brain by architecting a prototype of a brain-derived neuromorphic computing system consisting of artificial electronic, ionic, and photonic materials, devices, and circuits with dynamicity resembling the bio-plausible molecular, neuro/synaptic, neuro-circuit, and multi-structural hierarchical macro-circuits of the brain based on well-tested computational models. We further argue the importance of bio-plausible local learning algorithms applicable to the neuromorphic computing system that capture the flexible and adaptive unsupervised and self-supervised learning mechanisms central to human intelligence. Most importantly, we emphasize that the unique capabilities of brain-derived neuromorphic computing prototype systems will enable us to understand links between specific neuronal and network-level properties and system-level functioning and behavior. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">15 pages, 12 figures</span> </p> </li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.15227">arXiv:2403.15227</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.15227">pdf</a>, <a href="https://arxiv.org/format/2403.15227">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> LeGO: Leveraging a Surface Deformation Network for Animatable Stylized Face Generation with One Example </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yoon%2C+S">Soyeon Yoon</a>, <a href="/search/cs?searchtype=author&amp;query=Yun%2C+K">Kwan Yun</a>, <a href="/search/cs?searchtype=author&amp;query=Seo%2C+K">Kwanggyoon Seo</a>, <a href="/search/cs?searchtype=author&amp;query=Cha%2C+S">Sihun Cha</a>, <a href="/search/cs?searchtype=author&amp;query=Yoo%2C+J+E">Jung Eun Yoo</a>, <a href="/search/cs?searchtype=author&amp;query=Noh%2C+J">Junyong Noh</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Recent advances in 3D face stylization have made significant strides in few to zero-shot settings. However, the degree of stylization achieved by existing methods is often not sufficient for practical applications because they are mostly based on statistical 3D Morphable Models (3DMM) with limited variations. To this end, we propose a method that can produce a highly stylized 3D face model with desired topology. Our method trains a surface deformation network with 3DMM and translates its domain to the target style using a paired exemplar. The network achieves stylization of the 3D face mesh by mimicking the style of the target using a differentiable renderer and directional CLIP losses. Additionally, during the inference process, we utilize a Mesh Agnostic Encoder (MAGE) that takes a deformation target, a mesh of diverse topology, as input to the stylization process and encodes its shape into our latent space. The resulting stylized face model can be animated by commonly used 3DMM blend shapes. A set of quantitative and qualitative evaluations demonstrate that our method can produce highly stylized face meshes according to a given style and output them in a desired topology. We also demonstrate example applications of our method including image-based stylized avatar generation, linear interpolation of geometric styles, and facial animation of stylized avatars. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68T45 <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.4.9 </p> </li>
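<p>The directional CLIP loss mentioned above aligns the image-space edit direction with the text-space direction between source and target descriptions, as popularized by StyleGAN-NADA-style domain transfer. Encoders are passed in as hypothetical callables:</p>
<pre><code class="language-python">
# 1 - cosine similarity between the image-embedding edit direction and the
# text-embedding source-to-target direction.
import torch
import torch.nn.functional as F

def directional_clip_loss(img_enc, txt_enc, rendered, source_render,
                          target_text, source_text):
    d_img = F.normalize(img_enc(rendered) - img_enc(source_render), dim=-1)
    d_txt = F.normalize(txt_enc(target_text) - txt_enc(source_text), dim=-1)
    return (1 - (d_img * d_txt).sum(-1)).mean()
</code></pre>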
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.10906">arXiv:2403.10906</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.10906">pdf</a>, <a href="https://arxiv.org/format/2403.10906">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> HourglassNeRF: Casting an Hourglass as a Bundle of Rays for Few-shot Neural Rendering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Seo%2C+S">Seunghyeon Seo</a>, <a href="/search/cs?searchtype=author&amp;query=Chang%2C+Y">Yeonjin Chang</a>, <a href="/search/cs?searchtype=author&amp;query=Yoo%2C+J">Jayeon Yoo</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+S">Seungwoo Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+H">Hojun Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Kwak%2C+N">Nojun Kwak</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Recent advancements in the Neural Radiance Field (NeRF) have bolstered its capabilities for novel view synthesis, yet its reliance on dense multi-view training images poses a practical challenge. Addressing this, we propose HourglassNeRF, an effective regularization-based approach with a novel hourglass casting strategy. Our proposed hourglass is conceptualized as a bundle of additional rays within the area between the original input ray and its corresponding reflection ray, by featurizing the conical frustum via Integrated Positional Encoding (IPE). This design expands the coverage of unseen views and enables an adaptive high-frequency regularization based on target pixel photo-consistency. Furthermore, we propose luminance consistency regularization based on the Lambertian assumption, which is known to be effective for training a set of augmented rays under the few-shot setting. Leveraging the inherent property of a Lambertian surface, which retains consistent luminance irrespective of the viewing angle, we treat our proposed hourglass as a collection of flipped diffuse reflection rays and enhance the luminance consistency between the original input ray and its corresponding hourglass, resulting in a more physically grounded training framework and performance improvement. Our HourglassNeRF outperforms its baseline and achieves competitive results on multiple benchmarks with sharply rendered fine details. The code will be available. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">21 pages, 11 figures</span> </p> </li>
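<p>A sketch of the luminance-consistency term: under the Lambertian assumption, the hourglass bundle should see the same luminance as its original ray, so the rendered luminances are pulled together. Rendering itself is abstracted away; shapes and weights are illustrative.</p>
<pre><code class="language-python">
# Pull the luminance of each hourglass ray toward that of its input ray.
import torch

def luminance(rgb):                                   # Rec. 709 luma weights
    w = torch.tensor([0.2126, 0.7152, 0.0722], device=rgb.device)
    return (rgb * w).sum(-1)

def luminance_consistency_loss(original_rgb, hourglass_rgb):
    # original_rgb: (B, 3); hourglass_rgb: (B, R, 3) bundle per input ray
    return (luminance(hourglass_rgb)
            - luminance(original_rgb).unsqueeze(1)).abs().mean()
</code></pre>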
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">21 pages, 11 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.09675">arXiv:2403.09675</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.09675">pdf</a>, <a href="https://arxiv.org/format/2403.09675">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> Open-Universe Indoor Scene Generation using LLM Program Synthesis and Uncurated Object Databases </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Aguina-Kang%2C+R">Rio Aguina-Kang</a>, <a href="/search/cs?searchtype=author&amp;query=Gumin%2C+M">Maxim Gumin</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+D+H">Do Heon Han</a>, <a href="/search/cs?searchtype=author&amp;query=Morris%2C+S">Stewart Morris</a>, <a href="/search/cs?searchtype=author&amp;query=Yoo%2C+S+J">Seung Jean Yoo</a>, <a href="/search/cs?searchtype=author&amp;query=Ganeshan%2C+A">Aditya Ganeshan</a>, <a href="/search/cs?searchtype=author&amp;query=Jones%2C+R+K">R. Kenny Jones</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+Q+A">Qiuhong Anna Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Fu%2C+K">Kailiang Fu</a>, <a href="/search/cs?searchtype=author&amp;query=Ritchie%2C+D">Daniel Ritchie</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.09675v1-abstract-short" style="display: inline;"> We present a system for generating indoor scenes in response to text prompts. The prompts are not limited to a fixed vocabulary of scene descriptions, and the objects in generated scenes are not restricted to a fixed set of object categories -- we call this setting indoor scene generation. Unlike most prior work on indoor scene generation, our system does not require a large training dataset of ex&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.09675v1-abstract-full').style.display = 'inline'; document.getElementById('2403.09675v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.09675v1-abstract-full" style="display: none;"> We present a system for generating indoor scenes in response to text prompts. The prompts are not limited to a fixed vocabulary of scene descriptions, and the objects in generated scenes are not restricted to a fixed set of object categories -- we call this setting indoor scene generation. Unlike most prior work on indoor scene generation, our system does not require a large training dataset of existing 3D scenes. Instead, it leverages the world knowledge encoded in pre-trained large language models (LLMs) to synthesize programs in a domain-specific layout language that describe objects and spatial relations between them. 
Executing such a program produces a specification of a constraint satisfaction problem, which the system solves using a gradient-based optimization scheme to produce object positions and orientations. To produce object geometry, the system retrieves 3D meshes from a database. Unlike prior work which uses databases of category-annotated, mutually-aligned meshes, we develop a pipeline using vision-language models (VLMs) to retrieve meshes from massive databases of un-annotated, inconsistently-aligned meshes. Experimental evaluations show that our system outperforms generative models trained on 3D data for traditional, closed-universe scene generation tasks; it also outperforms a recent LLM-based layout generation method on open-universe scene generation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.09675v1-abstract-full').style.display = 'none'; document.getElementById('2403.09675v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">See ancillary files for link to supplemental material</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.09669">arXiv:2403.09669</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.09669">pdf</a>, <a href="https://arxiv.org/format/2403.09669">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> STREAM: Spatio-TempoRal Evaluation and Analysis Metric for Video Generative Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Kim%2C+P+J">Pum Jun Kim</a>, <a href="/search/cs?searchtype=author&amp;query=Kim%2C+S">Seojun Kim</a>, <a href="/search/cs?searchtype=author&amp;query=Yoo%2C+J">Jaejun Yoo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.09669v3-abstract-short" style="display: inline;"> Image generative models have made significant progress in generating realistic and diverse images, supported by comprehensive guidance from various evaluation metrics. However, current video generative models struggle to generate even short video clips, with limited tools that provide insights for improvements. 
Current video evaluation metrics are simple adaptations of image metrics by switching t&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.09669v3-abstract-full').style.display = 'inline'; document.getElementById('2403.09669v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.09669v3-abstract-full" style="display: none;"> Image generative models have made significant progress in generating realistic and diverse images, supported by comprehensive guidance from various evaluation metrics. However, current video generative models struggle to generate even short video clips, with limited tools that provide insights for improvements. Current video evaluation metrics are simple adaptations of image metrics by switching the embeddings with video embedding networks, which may underestimate the unique characteristics of video. Our analysis reveals that the widely used Frechet Video Distance (FVD) has a stronger emphasis on the spatial aspect than the temporal naturalness of video and is inherently constrained by the input size of the embedding networks used, limiting it to 16 frames. Additionally, it demonstrates considerable instability and diverges from human evaluations. To address these limitations, we propose STREAM, a new video evaluation metric uniquely designed to independently evaluate spatial and temporal aspects. This feature allows comprehensive analysis and evaluation of video generative models from various perspectives, unconstrained by video length. We provide analytical and experimental evidence demonstrating that STREAM provides an effective evaluation tool for both visual and temporal quality of videos, offering insights into areas of improvement for video generative models. To the best of our knowledge, STREAM is the first evaluation metric that can separately assess the temporal and spatial aspects of videos. Our code is available at https://github.com/pro2nit/STREAM. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.09669v3-abstract-full').style.display = 'none'; document.getElementById('2403.09669v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 30 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024.
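<p class="is-size-7">To make the spatial/temporal decoupling concrete, the toy functions below score the two axes separately on a raw video tensor; these crude statistics are stand-ins for illustration only and are not the STREAM metric itself.</p>
<pre><code># Toy decoupling of spatial vs. temporal quality (illustrative, not STREAM)
import numpy as np

def spatial_score(video):
    # video: (T, H, W, C) in [0, 1]; mean per-frame contrast as a spatial proxy
    return float(np.mean([frame.std() for frame in video]))

def temporal_score(video):
    # mean frame-to-frame change as a temporal-dynamics proxy
    return float(np.abs(np.diff(video, axis=0)).mean())

video = np.random.rand(16, 64, 64, 3)
print(spatial_score(video), temporal_score(video))
</code></pre>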
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Our work is accepted to ICLR 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.01663">arXiv:2403.01663</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.01663">pdf</a>, <a href="https://arxiv.org/format/2403.01663">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> PillarGen: Enhancing Radar Point Cloud Density and Quality via Pillar-based Point Generation Network </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Kim%2C+J">Jisong Kim</a>, <a href="/search/cs?searchtype=author&amp;query=Bang%2C+G">Geonho Bang</a>, <a href="/search/cs?searchtype=author&amp;query=Choi%2C+K">Kwangjin Choi</a>, <a href="/search/cs?searchtype=author&amp;query=Seong%2C+M">Minjae Seong</a>, <a href="/search/cs?searchtype=author&amp;query=Yoo%2C+J">Jaechang Yoo</a>, <a href="/search/cs?searchtype=author&amp;query=Pyo%2C+E">Eunjong Pyo</a>, <a href="/search/cs?searchtype=author&amp;query=Choi%2C+J+W">Jun Won Choi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.01663v2-abstract-short" style="display: inline;"> In this paper, we present a novel point generation model, referred to as Pillar-based Point Generation Network (PillarGen), which facilitates the transformation of point clouds from one domain into another. PillarGen can produce synthetic point clouds with enhanced density and quality based on the provided input point clouds. The PillarGen model performs the following three steps: 1) pillar encodi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.01663v2-abstract-full').style.display = 'inline'; document.getElementById('2403.01663v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.01663v2-abstract-full" style="display: none;"> In this paper, we present a novel point generation model, referred to as Pillar-based Point Generation Network (PillarGen), which facilitates the transformation of point clouds from one domain into another. PillarGen can produce synthetic point clouds with enhanced density and quality based on the provided input point clouds. The PillarGen model performs the following three steps: 1) pillar encoding, 2) Occupied Pillar Prediction (OPP), and 3) Pillar to Point Generation (PPG). The input point clouds are encoded using a pillar grid structure to generate pillar features. Then, OPP determines the active pillars used for point generation and predicts the center of points and the number of points to be generated for each active pillar. PPG generates the synthetic points for each active pillar based on the information provided by OPP. We evaluate the performance of PillarGen using our proprietary radar dataset, focusing on enhancing the density and quality of short-range radar data using the long-range radar data as supervision. 
Our experiments demonstrate that PillarGen outperforms traditional point upsampling methods in quantitative and qualitative measures. We also confirm that when PillarGen is incorporated into bird&#39;s eye view object detection, a significant improvement in detection accuracy is achieved. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.01663v2-abstract-full').style.display = 'none'; document.getElementById('2403.01663v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by IEEE International Conference on Robotics and Automation (ICRA 2024), 8 pages, 3 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.01344">arXiv:2403.01344</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.01344">pdf</a>, <a href="https://arxiv.org/format/2403.01344">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Mitigating the Bias in the Model for Continual Test-Time Adaptation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chung%2C+I">Inseop Chung</a>, <a href="/search/cs?searchtype=author&amp;query=Hwang%2C+K">Kyomin Hwang</a>, <a href="/search/cs?searchtype=author&amp;query=Yoo%2C+J">Jayeon Yoo</a>, <a href="/search/cs?searchtype=author&amp;query=Kwak%2C+N">Nojun Kwak</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.01344v1-abstract-short" style="display: inline;"> Continual Test-Time Adaptation (CTA) is a challenging task that aims to adapt a source pre-trained model to continually changing target domains. In the CTA setting, a model does not know when the target domain changes, thus facing a drastic change in the distribution of streaming inputs during the test-time. The key challenge is to keep adapting the model to the continually changing target domains&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.01344v1-abstract-full').style.display = 'inline'; document.getElementById('2403.01344v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.01344v1-abstract-full" style="display: none;"> Continual Test-Time Adaptation (CTA) is a challenging task that aims to adapt a source pre-trained model to continually changing target domains. In the CTA setting, a model does not know when the target domain changes, thus facing a drastic change in the distribution of streaming inputs during the test-time. 
The key challenge is to keep adapting the model to the continually changing target domains in an online manner. We find that a model shows highly biased predictions as it constantly adapts to the changing distribution of the target data. It predicts certain classes more often than other classes, making inaccurate over-confident predictions. This paper mitigates this issue to improve performance in the CTA scenario. To alleviate the bias issue, we maintain class-wise exponential moving average target prototypes with reliable target samples and exploit them to cluster the target features class-wise. Moreover, we aim to align the target distributions to the source distribution by anchoring the target feature to its corresponding source prototype. With extensive experiments, our proposed method achieves a noteworthy performance gain when applied on top of existing CTA methods without substantial adaptation time overhead. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.01344v1-abstract-full').style.display = 'none'; document.getElementById('2403.01344v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.13729">arXiv:2402.13729</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2402.13729">pdf</a>, <a href="https://arxiv.org/format/2402.13729">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Hybrid Video Diffusion Models with 2D Triplane and 3D Wavelet Representation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Kim%2C+K">Kihong Kim</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+H">Haneol Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Park%2C+J">Jihye Park</a>, <a href="/search/cs?searchtype=author&amp;query=Kim%2C+S">Seyeon Kim</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+K">Kwanghee Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Kim%2C+S">Seungryong Kim</a>, <a href="/search/cs?searchtype=author&amp;query=Yoo%2C+J">Jaejun Yoo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.13729v4-abstract-short" style="display: inline;"> Generating high-quality videos that synthesize desired realistic content is a challenging task due to the intricate high dimensionality and complexity of videos. Several recent diffusion-based methods have shown comparable performance by compressing videos to a lower-dimensional latent space, using a traditional video autoencoder architecture.
However, such methods, which employ standard frame-wise 2&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.13729v4-abstract-full').style.display = 'inline'; document.getElementById('2402.13729v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.13729v4-abstract-full" style="display: none;"> Generating high-quality videos that synthesize desired realistic content is a challenging task due to the intricate high dimensionality and complexity of videos. Several recent diffusion-based methods have shown comparable performance by compressing videos to a lower-dimensional latent space, using a traditional video autoencoder architecture. However, such methods, which employ standard frame-wise 2D and 3D convolutions, fail to fully exploit the spatio-temporal nature of videos. To address this issue, we propose a novel hybrid video diffusion model, called HVDM, which can capture spatio-temporal dependencies more effectively. HVDM is trained with a hybrid video autoencoder that extracts a disentangled representation of the video, including (i) global context information captured by a 2D projected latent, (ii) local volume information captured by 3D convolutions with wavelet decomposition, and (iii) frequency information for improving the video reconstruction. Based on this disentangled representation, our hybrid autoencoder provides a more comprehensive video latent, enriching the generated videos with fine structures and details. Experiments on video generation benchmarks (UCF101, SkyTimelapse, and TaiChi) demonstrate that the proposed approach achieves state-of-the-art video generation quality, showing a wide range of video applications (e.g., long video generation, image-to-video, and video dynamics control). <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.13729v4-abstract-full').style.display = 'none'; document.getElementById('2402.13729v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024.
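<p class="is-size-7">The 3D wavelet branch can be pictured with a single-level Haar split of the video volume, sketched below under illustrative shapes; this is the generic transform, not the HVDM code.</p>
<pre><code># Single-level 3D Haar wavelet split of a (C, T, H, W) video into 8 subbands
import torch

def haar_split(x, dim):
    a, b = x.unfold(dim, 2, 2).unbind(-1)
    return (a + b) / 2.0, (a - b) / 2.0  # low-pass, high-pass along one axis

def haar3d(video):
    # requires even T, H, W
    bands = [video]
    for dim in (1, 2, 3):
        bands = [sub for band in bands for sub in haar_split(band, dim)]
    return bands

subbands = haar3d(torch.rand(3, 8, 32, 32))
print(len(subbands), subbands[0].shape)  # 8 subbands, each (3, 4, 16, 16)
</code></pre>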
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page is available at https://hxngiee.github.io/HVDM/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.11201">arXiv:2402.11201</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2402.11201">pdf</a>, <a href="https://arxiv.org/format/2402.11201">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> A Decoding Scheme with Successive Aggregation of Multi-Level Features for Light-Weight Semantic Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yoo%2C+J">Jiwon Yoo</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+J">Jangwon Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Kim%2C+G">Gyeonghwan Kim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.11201v2-abstract-short" style="display: inline;"> Multi-scale architecture, including hierarchical vision transformer, has been commonly applied to high-resolution semantic segmentation to deal with computational complexity with minimum performance loss. In this paper, we propose a novel decoding scheme for semantic segmentation in this regard, which takes multi-level features from the encoder with multi-scale architecture. The decoding scheme ba&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.11201v2-abstract-full').style.display = 'inline'; document.getElementById('2402.11201v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.11201v2-abstract-full" style="display: none;"> Multi-scale architecture, including hierarchical vision transformer, has been commonly applied to high-resolution semantic segmentation to deal with computational complexity with minimum performance loss. In this paper, we propose a novel decoding scheme for semantic segmentation in this regard, which takes multi-level features from the encoder with multi-scale architecture. The decoding scheme based on a multi-level vision transformer aims to achieve not only reduced computational expense but also higher segmentation accuracy, by introducing successive cross-attention in aggregation of the multi-level features. Furthermore, a way to enhance the multi-level features by the aggregated semantics is proposed. The effort is focused on maintaining the contextual consistency from the perspective of attention allocation and brings improved performance with significantly lower computational cost. Set of experiments on popular datasets demonstrates superiority of the proposed scheme to the state-of-the-art semantic segmentation models in terms of computational cost without loss of accuracy, and extensive ablation studies prove the effectiveness of ideas proposed. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.11201v2-abstract-full').style.display = 'none'; document.getElementById('2402.11201v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7 pages, 4 figures, ICIP2024 Accepted paper</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.09542">arXiv:2402.09542</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2402.09542">pdf</a>, <a href="https://arxiv.org/format/2402.09542">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Layerwise Proximal Replay: A Proximal Point Method for Online Continual Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yoo%2C+J">Jason Yoo</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yunpeng Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Wood%2C+F">Frank Wood</a>, <a href="/search/cs?searchtype=author&amp;query=Pleiss%2C+G">Geoff Pleiss</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.09542v3-abstract-short" style="display: inline;"> In online continual learning, a neural network incrementally learns from a non-i.i.d. data stream. Nearly all online continual learning methods employ experience replay to simultaneously prevent catastrophic forgetting and underfitting on past data. Our work demonstrates a limitation of this approach: neural networks trained with experience replay tend to have unstable optimization trajectories, i&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.09542v3-abstract-full').style.display = 'inline'; document.getElementById('2402.09542v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.09542v3-abstract-full" style="display: none;"> In online continual learning, a neural network incrementally learns from a non-i.i.d. data stream. Nearly all online continual learning methods employ experience replay to simultaneously prevent catastrophic forgetting and underfitting on past data. Our work demonstrates a limitation of this approach: neural networks trained with experience replay tend to have unstable optimization trajectories, impeding their overall accuracy. Surprisingly, these instabilities persist even when the replay buffer stores all previous training examples, suggesting that this issue is orthogonal to catastrophic forgetting. We minimize these instabilities through a simple modification of the optimization geometry. 
Our solution, Layerwise Proximal Replay (LPR), balances learning from new and replay data while only allowing for gradual changes in the hidden activation of past data. We demonstrate that LPR consistently improves replay-based online continual learning methods across multiple problem settings, regardless of the amount of available replay memory. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.09542v3-abstract-full').style.display = 'none'; document.getElementById('2402.09542v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.08138">arXiv:2402.08138</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2402.08138">pdf</a>, <a href="https://arxiv.org/format/2402.08138">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> H2O-SDF: Two-phase Learning for 3D Indoor Reconstruction using Object Surface Fields </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Park%2C+M">Minyoung Park</a>, <a href="/search/cs?searchtype=author&amp;query=Do%2C+M">Mirae Do</a>, <a href="/search/cs?searchtype=author&amp;query=Shin%2C+Y">YeonJae Shin</a>, <a href="/search/cs?searchtype=author&amp;query=Yoo%2C+J">Jaeseok Yoo</a>, <a href="/search/cs?searchtype=author&amp;query=Hong%2C+J">Jongkwang Hong</a>, <a href="/search/cs?searchtype=author&amp;query=Kim%2C+J">Joongrock Kim</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+C">Chul Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.08138v2-abstract-short" style="display: inline;"> Advanced techniques using Neural Radiance Fields (NeRF), Signed Distance Fields (SDF), and Occupancy Fields have recently emerged as solutions for 3D indoor scene reconstruction. We introduce a novel two-phase learning approach, H2O-SDF, that discriminates between object and non-object regions within indoor environments. This method achieves a nuanced balance, carefully preserving the geometric in&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.08138v2-abstract-full').style.display = 'inline'; document.getElementById('2402.08138v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.08138v2-abstract-full" style="display: none;"> Advanced techniques using Neural Radiance Fields (NeRF), Signed Distance Fields (SDF), and Occupancy Fields have recently emerged as solutions for 3D indoor scene reconstruction. We introduce a novel two-phase learning approach, H2O-SDF, that discriminates between object and non-object regions within indoor environments. 
This method achieves a nuanced balance, carefully preserving the geometric integrity of room layouts while also capturing intricate surface details of specific objects. A cornerstone of our two-phase learning framework is the introduction of the Object Surface Field (OSF), a novel concept designed to mitigate the persistent vanishing gradient problem that has previously hindered the capture of high-frequency details in other methods. Our proposed approach is validated through several experiments that include ablation studies. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.08138v2-abstract-full').style.display = 'none'; document.getElementById('2402.08138v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.04621">arXiv:2402.04621</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2402.04621">pdf</a>, <a href="https://arxiv.org/format/2402.04621">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Feature Distribution on Graph Topology Mediates the Effect of Graph Convolution: Homophily Perspective </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lee%2C+S+Y">Soo Yong Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Kim%2C+S">Sunwoo Kim</a>, <a href="/search/cs?searchtype=author&amp;query=Bu%2C+F">Fanchen Bu</a>, <a href="/search/cs?searchtype=author&amp;query=Yoo%2C+J">Jaemin Yoo</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+J">Jiliang Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Shin%2C+K">Kijung Shin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.04621v2-abstract-short" style="display: inline;"> How would randomly shuffling feature vectors among nodes from the same class affect graph neural networks (GNNs)? The feature shuffle, intuitively, perturbs the dependence between graph topology and features (A-X dependence) for GNNs to learn from. Surprisingly, we observe a consistent and significant improvement in GNN performance following the feature shuffle. Having overlooked the impact of A-X&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.04621v2-abstract-full').style.display = 'inline'; document.getElementById('2402.04621v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.04621v2-abstract-full" style="display: none;"> How would randomly shuffling feature vectors among nodes from the same class affect graph neural networks (GNNs)? The feature shuffle, intuitively, perturbs the dependence between graph topology and features (A-X dependence) for GNNs to learn from. Surprisingly, we observe a consistent and significant improvement in GNN performance following the feature shuffle. 
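<p class="is-size-7">The shuffle itself is simple to state in code; the sketch below permutes feature vectors among same-class nodes, which preserves class-conditional feature statistics while perturbing the A-X dependence (shapes and seed are illustrative).</p>
<pre><code># Intra-class feature shuffle for node features X with labels y
import numpy as np

def intra_class_shuffle(X, y, seed=0):
    rng = np.random.default_rng(seed)
    X = X.copy()
    for c in np.unique(y):
        idx = np.nonzero(y == c)[0]
        X[idx] = X[rng.permutation(idx)]  # permute features within class c
    return X

X = np.arange(12, dtype=float).reshape(6, 2)
y = np.array([0, 0, 1, 1, 1, 0])
print(intra_class_shuffle(X, y))
</code></pre>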
Having overlooked the impact of A-X dependence on GNNs, the prior literature does not provide a satisfactory understanding of the phenomenon. Thus, we raise two research questions. First, how should A-X dependence be measured, while controlling for potential confounds? Second, how does A-X dependence affect GNNs? In response, we (i) propose a principled measure for A-X dependence, (ii) design a random graph model that controls A-X dependence, (iii) establish a theory on how A-X dependence relates to graph convolution, and (iv) present empirical analysis on real-world graphs that align with the theory. We conclude that A-X dependence mediates the effect of graph convolution, such that smaller dependence improves GNN-based node classification. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.04621v2-abstract-full').style.display = 'none'; document.getElementById('2402.04621v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">published in ICML 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.15944">arXiv:2401.15944</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2401.15944">pdf</a>, <a href="https://arxiv.org/format/2401.15944">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/LGRS.2023.3336680">10.1109/LGRS.2023.3336680 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Bridging the Domain Gap: A Simple Domain Matching Method for Reference-based Image Super-Resolution in Remote Sensing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Min%2C+J">Jeongho Min</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+Y">Yejun Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Kim%2C+D">Dongyoung Kim</a>, <a href="/search/cs?searchtype=author&amp;query=Yoo%2C+J">Jaejun Yoo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.15944v1-abstract-short" style="display: inline;"> Recently, reference-based image super-resolution (RefSR) has shown excellent performance in image super-resolution (SR) tasks. The main idea of RefSR is to utilize additional information from the reference (Ref) image to recover the high-frequency components in low-resolution (LR) images. 
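<p class="is-size-7">A crude sketch of the feature-matching step at the heart of RefSR, assuming feature vectors have already been extracted; real models match learned multi-scale features rather than the random stand-ins used here.</p>
<pre><code># Nearest-reference matching by cosine similarity
import torch
import torch.nn.functional as F

def match_features(lr_feat, ref_feat):
    # lr_feat: (N, D), ref_feat: (M, D); index of best Ref match per LR vector
    sim = F.normalize(lr_feat, dim=1) @ F.normalize(ref_feat, dim=1).t()  # (N, M)
    return sim.argmax(dim=1)

idx = match_features(torch.rand(10, 8), torch.rand(50, 8))
print(idx.shape)  # torch.Size([10])
</code></pre>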
By transferring relevant textures through feature matching, RefSR models outperform existing single image supe&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.15944v1-abstract-full').style.display = 'inline'; document.getElementById('2401.15944v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.15944v1-abstract-full" style="display: none;"> Recently, reference-based image super-resolution (RefSR) has shown excellent performance in image super-resolution (SR) tasks. The main idea of RefSR is to utilize additional information from the reference (Ref) image to recover the high-frequency components in low-resolution (LR) images. By transferring relevant textures through feature matching, RefSR models outperform existing single image super-resolution (SISR) models. However, their performance significantly declines when a domain gap between Ref and LR images exists, which often occurs in real-world scenarios, such as satellite imaging. In this letter, we introduce a Domain Matching (DM) module that can be seamlessly integrated with existing RefSR models to enhance their performance in a plug-and-play manner. To the best of our knowledge, we are the first to explore Domain Matching-based RefSR in remote sensing image processing. Our analysis reveals that their domain gaps often occur in different satellites, and our model effectively addresses these challenges, whereas existing models struggle. Our experiments demonstrate that the proposed DM module improves SR performance both qualitatively and quantitatively for remote sensing super-resolution tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.15944v1-abstract-full').style.display = 'none'; document.getElementById('2401.15944v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to IEEE GRSL 2023</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Report number:</span> Article Sequence Number: 8000105, Print ISSN: 1545-598X </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Volume: 21, Year: 2023, Page: 1-5 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.08875">arXiv:2312.08875</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2312.08875">pdf</a>, <a href="https://arxiv.org/format/2312.08875">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> What, How, and When Should Object Detectors Update in Continually Changing Test Domains? 
</p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yoo%2C+J">Jayeon Yoo</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+D">Dongkwan Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Chung%2C+I">Inseop Chung</a>, <a href="/search/cs?searchtype=author&amp;query=Kim%2C+D">Donghyun Kim</a>, <a href="/search/cs?searchtype=author&amp;query=Kwak%2C+N">Nojun Kwak</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.08875v1-abstract-short" style="display: inline;"> It is a well-known fact that the performance of deep learning models deteriorates when they encounter a distribution shift at test time. Test-time adaptation (TTA) algorithms have been proposed to adapt the model online while inferring test data. However, existing research predominantly focuses on classification tasks through the optimization of batch normalization layers or classification heads,&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.08875v1-abstract-full').style.display = 'inline'; document.getElementById('2312.08875v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.08875v1-abstract-full" style="display: none;"> It is a well-known fact that the performance of deep learning models deteriorates when they encounter a distribution shift at test time. Test-time adaptation (TTA) algorithms have been proposed to adapt the model online while inferring test data. However, existing research predominantly focuses on classification tasks through the optimization of batch normalization layers or classification heads, but this approach limits its applicability to various model architectures like Transformers and makes it challenging to apply to other tasks, such as object detection. In this paper, we propose a novel online adaption approach for object detection in continually changing test domains, considering which part of the model to update, how to update it, and when to perform the update. By introducing architecture-agnostic and lightweight adaptor modules and only updating these while leaving the pre-trained backbone unchanged, we can rapidly adapt to new test domains in an efficient way and prevent catastrophic forgetting. Furthermore, we present a practical and straightforward class-wise feature aligning method for object detection to resolve domain shifts. Additionally, we enhance efficiency by determining when the model is sufficiently adapted or when additional adaptation is needed due to changes in the test distribution. Our approach surpasses baselines on widely used benchmarks, achieving improvements of up to 4.9\%p and 7.9\%p in mAP for COCO $\rightarrow$ COCO-corrupted and SHIFT, respectively, while maintaining about 20 FPS or higher. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.08875v1-abstract-full').style.display = 'none'; document.getElementById('2312.08875v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.07266">arXiv:2312.07266</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2312.07266">pdf</a>, <a href="https://arxiv.org/format/2312.07266">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ProxyDet: Synthesizing Proxy Novel Classes via Classwise Mixup for Open-Vocabulary Object Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Jeong%2C+J">Joonhyun Jeong</a>, <a href="/search/cs?searchtype=author&amp;query=Park%2C+G">Geondo Park</a>, <a href="/search/cs?searchtype=author&amp;query=Yoo%2C+J">Jayeon Yoo</a>, <a href="/search/cs?searchtype=author&amp;query=Jung%2C+H">Hyungsik Jung</a>, <a href="/search/cs?searchtype=author&amp;query=Kim%2C+H">Heesu Kim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.07266v4-abstract-short" style="display: inline;"> Open-vocabulary object detection (OVOD) aims to recognize novel objects whose categories are not included in the training set. In order to classify these unseen classes during training, many OVOD frameworks leverage the zero-shot capability of largely pretrained vision and language models, such as CLIP. To further improve generalization on the unseen novel classes, several approaches proposed to a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.07266v4-abstract-full').style.display = 'inline'; document.getElementById('2312.07266v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.07266v4-abstract-full" style="display: none;"> Open-vocabulary object detection (OVOD) aims to recognize novel objects whose categories are not included in the training set. In order to classify these unseen classes during training, many OVOD frameworks leverage the zero-shot capability of largely pretrained vision and language models, such as CLIP. To further improve generalization on the unseen novel classes, several approaches proposed to additionally train with pseudo region labeling on the external data sources that contain a substantial number of novel category labels beyond the existing training data. Albeit its simplicity, these pseudo-labeling methods still exhibit limited improvement with regard to the truly unseen novel classes that were not pseudo-labeled. In this paper, we present a novel, yet simple technique that helps generalization on the overall distribution of novel classes. Inspired by our observation that numerous novel classes reside within the convex hull constructed by the base (seen) classes in the CLIP embedding space, we propose to synthesize proxy-novel classes approximating novel classes via linear mixup between a pair of base classes. By training our detector with these synthetic proxy-novel classes, we effectively explore the embedding space of novel classes. The experimental results on various OVOD benchmarks such as LVIS and COCO demonstrate superior performance on novel classes compared to the other state-of-the-art methods. Code is available at https://github.com/clovaai/ProxyDet. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.07266v4-abstract-full').style.display = 'none'; document.getElementById('2312.07266v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted in AAAI24. Code: https://github.com/clovaai/ProxyDet Project page: https://proxydet.github.io</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.14496">arXiv:2311.14496</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2311.14496">pdf</a>, <a href="https://arxiv.org/format/2311.14496">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> RTPS Attack Dataset Description </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Kim%2C+D+Y">Dong Young Kim</a>, <a href="/search/cs?searchtype=author&amp;query=Kim%2C+D">Dongsung Kim</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+Y">Yuchan Song</a>, <a href="/search/cs?searchtype=author&amp;query=Kim%2C+G+M">Gang Min Kim</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+M+G">Min Geun Song</a>, <a href="/search/cs?searchtype=author&amp;query=Yoo%2C+J+D">Jeong Do Yoo</a>, <a href="/search/cs?searchtype=author&amp;query=Kim%2C+H+K">Huy Kang Kim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.14496v4-abstract-short" style="display: inline;"> This paper explains all about our RTPS datasets. We collect malicious/benign packet data by injecting attack data in an Unmanned Ground Vehicle (UGV) in the normal state. We assembled the testbed, consisting of UGV, Controller, PC, and Router. We collect this dataset in the UGV part of our testbed. We conducted two types of attack &#34;Command Injection&#34; and &#34;Command Injection with ARP Spoofing&#34; on&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.14496v4-abstract-full').style.display = 'inline'; document.getElementById('2311.14496v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.14496v4-abstract-full" style="display: none;"> This paper explains all about our RTPS datasets. We collect malicious/benign packet data by injecting attack data in an Unmanned Ground Vehicle (UGV) in the normal state. We assembled the testbed, consisting of UGV, Controller, PC, and Router. We collect this dataset in the UGV part of our testbed. We conducted two types of attack &#34;Command Injection&#34; and &#34;Command Injection with ARP Spoofing&#34; on our testbed. The data collection time is 180, 300, 600, and 1200. The scenario has 30 each on collection time, 240 total. 
We expect this dataset to contribute to the development of defense technologies like anomaly detection to address security threat issues in ROS2 networks and Fast-DDS implementations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.14496v4-abstract-full').style.display = 'none'; document.getElementById('2311.14496v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This manuscript is written in Korean. You can download our dataset through our lab: https://ocslab.hksecurity.net/Datasets/rtps-attack-dataset We welcome your comments or feedback. Contact INFO: Dong Young Kim (klgh1256@korea.ac.kr), Huy Kang Kim (cenda@korea.ac.kr)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.14342">arXiv:2311.14342</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2311.14342">pdf</a>, <a href="https://arxiv.org/format/2311.14342">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> AI-based Attack Graph Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Park%2C+S">Sangbeom Park</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+J">Jaesung Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Yoo%2C+J+D">Jeong Do Yoo</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+M+G">Min Geun Song</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+H">Hyosun Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Choi%2C+J">Jaewoong Choi</a>, <a href="/search/cs?searchtype=author&amp;query=Sagong%2C+C">Chaeyeon Sagong</a>, <a href="/search/cs?searchtype=author&amp;query=Kim%2C+H+K">Huy Kang Kim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.14342v2-abstract-short" style="display: inline;"> With the advancement of IoT technology, many electronic devices are interconnected through networks, communicating with each other and performing specific roles. However, as numerous devices join networks, the threat of cyberattacks also escalates. Preventing and detecting cyber threats are crucial, and one method of preventing such threats involves using attack graphs. Attack graphs are widely us&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.14342v2-abstract-full').style.display = 'inline'; document.getElementById('2311.14342v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.14342v2-abstract-full" style="display: none;"> With the advancement of IoT technology, many electronic devices are interconnected through networks, communicating with each other and performing specific roles.
However, as numerous devices join networks, the threat of cyberattacks also escalates. Preventing and detecting cyber threats are crucial, and one method of preventing such threats involves using attack graphs. Attack graphs are widely used to assess security threats within networks. However, a drawback emerges as the network scales, as generating attack graphs becomes time-consuming. To overcome this limitation, artificial intelligence models can be employed. By utilizing AI models, attack graphs can be created within a short period, approximating optimal outcomes. AI models designed for attack graph generation consist of encoders and decoders, trained using reinforcement learning algorithms. After training the AI models, we confirmed the model&#39;s learning effectiveness by observing changes in loss and reward values. Additionally, we compared attack graphs generated by the AI model with those created through conventional methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.14342v2-abstract-full').style.display = 'none'; document.getElementById('2311.14342v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">in Korean Language, 8 Figures, 14 Pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.14327">arXiv:2311.14327</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2311.14327">pdf</a>, <a href="https://arxiv.org/format/2311.14327">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> C-ITS Environment Modeling and Attack Modeling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Choi%2C+J">Jaewoong Choi</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+M+G">Min Geun Song</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+H">Hyosun Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Sagong%2C+C">Chaeyeon Sagong</a>, <a href="/search/cs?searchtype=author&amp;query=Park%2C+S">Sangbeom Park</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+J">Jaesung Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Yoo%2C+J+D">Jeong Do Yoo</a>, <a href="/search/cs?searchtype=author&amp;query=Kim%2C+H+K">Huy Kang Kim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.14327v2-abstract-short" style="display: inline;"> As technology advances, cities are evolving into smart cities, with the ability to process large amounts of data and the increasing complexity and diversification of various elements within urban areas. Among the core systems of a smart city is the Cooperative-Intelligent Transport Systems (C-ITS). 
arXiv:2311.14327 [pdf, other] cs.CR
C-ITS Environment Modeling and Attack Modeling
Authors: Jaewoong Choi, Min Geun Song, Hyosun Lee, Chaeyeon Sagong, Sangbeom Park, Jaesung Lee, Jeong Do Yoo, Huy Kang Kim
Abstract: As technology advances, cities are evolving into smart cities that can process large amounts of data, while the elements within urban areas grow increasingly complex and diverse. Among the core systems of a smart city is the Cooperative-Intelligent Transport Systems (C-ITS). C-ITS is a system in which vehicles provide drivers with real-time information about surrounding traffic conditions, sudden stops, falling objects, and other accident risks through roadside base stations. It consists of road infrastructure, C-ITS centers, and vehicle terminals. However, because smart cities integrate many elements through networks and electronic control, they are susceptible to cybersecurity issues, and cybersecurity problems in C-ITS carry a significant risk of safety issues. This technical document models the C-ITS environment and the services it provides, with the purpose of identifying the attack surface where security incidents could occur in a smart city environment. Based on the identified attack surface, it then constructs attack scenarios and their respective stages. The document describes the concept of C-ITS, followed by the C-ITS environment model, service model, and attack scenario model that we define.
Submitted 27 November, 2023; v1 submitted 24 November, 2023; originally announced November 2023.
Comments: In Korean; 14 figures, 15 pages

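As a rough illustration of the kind of environment and attack-scenario model the document describes, one might encode components, attack surfaces, and staged scenarios as plain data structures. The component names, interfaces, and stages below are illustrative assumptions, not the authors' models.

# Hypothetical encoding of a C-ITS environment / attack-scenario model.
# All names are illustrative only.
from dataclasses import dataclass, field

@dataclass
class Component:
    name: str                       # e.g., "roadside base station"
    interfaces: list[str] = field(default_factory=list)

@dataclass
class AttackStage:
    surface: str                    # which interface/component is attacked
    technique: str                  # e.g., "message spoofing"

@dataclass
class AttackScenario:
    name: str
    stages: list[AttackStage]

rsu = Component("roadside base station", ["V2I radio", "backhaul"])
scenario = AttackScenario(
    "falsified hazard warning",
    [AttackStage("V2I radio", "message spoofing"),
     AttackStage("vehicle terminal", "driver alert manipulation")],
)
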
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">in Korean Language, 14 Figures, 15 Pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.08745">arXiv:2310.08745</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2310.08745">pdf</a>, <a href="https://arxiv.org/format/2310.08745">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> AcTExplore: Active Tactile Exploration of Unknown Objects </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Shahidzadeh%2C+A">Amir-Hossein Shahidzadeh</a>, <a href="/search/cs?searchtype=author&amp;query=Yoo%2C+S+J">Seong Jong Yoo</a>, <a href="/search/cs?searchtype=author&amp;query=Mantripragada%2C+P">Pavan Mantripragada</a>, <a href="/search/cs?searchtype=author&amp;query=Singh%2C+C+D">Chahat Deep Singh</a>, <a href="/search/cs?searchtype=author&amp;query=Ferm%C3%BCller%2C+C">Cornelia Ferm眉ller</a>, <a href="/search/cs?searchtype=author&amp;query=Aloimonos%2C+Y">Yiannis Aloimonos</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.08745v3-abstract-short" style="display: inline;"> Tactile exploration plays a crucial role in understanding object structures for fundamental robotics tasks such as grasping and manipulation. However, efficiently exploring such objects using tactile sensors is challenging, primarily due to the large-scale unknown environments and limited sensing coverage of these sensors. To this end, we present AcTExplore, an active tactile exploration method dr&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.08745v3-abstract-full').style.display = 'inline'; document.getElementById('2310.08745v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.08745v3-abstract-full" style="display: none;"> Tactile exploration plays a crucial role in understanding object structures for fundamental robotics tasks such as grasping and manipulation. However, efficiently exploring such objects using tactile sensors is challenging, primarily due to the large-scale unknown environments and limited sensing coverage of these sensors. To this end, we present AcTExplore, an active tactile exploration method driven by reinforcement learning for object reconstruction at scales that automatically explores the object surfaces in a limited number of steps. Through sufficient exploration, our algorithm incrementally collects tactile data and reconstructs 3D shapes of the objects as well, which can serve as a representation for higher-level downstream tasks. Our method achieves an average of 95.97% IoU coverage on unseen YCB objects while just being trained on primitive shapes. 
arXiv:2310.02751 [pdf, other] cs.LG cs.CV
SHOT: Suppressing the Hessian along the Optimization Trajectory for Gradient-Based Meta-Learning
Authors: JunHoo Lee, Jayeon Yoo, Nojun Kwak
Abstract: In this paper, we hypothesize that gradient-based meta-learning (GBML) implicitly suppresses the Hessian along the optimization trajectory in the inner loop. Based on this hypothesis, we introduce an algorithm called SHOT (Suppressing the Hessian along the Optimization Trajectory) that minimizes the distance between the parameters of the target and reference models to suppress the Hessian in the inner loop. Despite dealing with high-order terms, SHOT does not increase the computational complexity of the baseline model much. It is agnostic to both the algorithm and architecture used in GBML, making it highly versatile and applicable to any GBML baseline. To validate the effectiveness of SHOT, we conduct empirical tests on standard few-shot learning tasks and qualitatively analyze its dynamics. We confirm our hypothesis empirically and demonstrate that SHOT outperforms the corresponding baseline. Code is available at: https://github.com/JunHoo-Lee/SHOT
Submitted 4 October, 2023; originally announced October 2023.

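Taking the abstract at face value, the core mechanism is a penalty on the distance between target-model and reference-model parameters during inner-loop adaptation. The sketch below shows that penalty in isolation; the choice of reference model (a pre-adaptation snapshot here) and the penalty weight are assumptions, and the released code at the URL above is the authoritative version.

# Inner-loop adaptation with a parameter-distance penalty (sketch of the
# idea stated in the abstract; weights and reference choice are assumed).
import torch
import torch.nn as nn

model = nn.Linear(4, 1)                        # target model (stand-in)
reference = nn.Linear(4, 1)                    # reference model (stand-in)
reference.load_state_dict(model.state_dict()) # e.g., pre-adaptation snapshot
opt = torch.optim.SGD(model.parameters(), lr=0.1)
x, y = torch.randn(16, 4), torch.randn(16, 1)
lam = 0.01                                     # penalty weight (assumed)

for _ in range(5):                             # inner loop
    task_loss = ((model(x) - y) ** 2).mean()
    dist = sum(((p - q.detach()) ** 2).sum()
               for p, q in zip(model.parameters(), reference.parameters()))
    loss = task_loss + lam * dist
    opt.zero_grad(); loss.backward(); opt.step()
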
arXiv:2309.01950 [pdf, other] cs.CV cs.AI cs.LG cs.SD eess.AS
RADIO: Reference-Agnostic Dubbing Video Synthesis
Authors: Dongyeun Lee, Chaewon Kim, Sangjoon Yu, Jaejun Yoo, Gyeong-Moon Park
Abstract: One of the most challenging problems in audio-driven talking head generation is achieving high-fidelity detail while ensuring precise synchronization. Given only a single reference image, extracting meaningful identity attributes becomes even more challenging, often causing the network to mirror the facial and lip structures too closely. To address these issues, we introduce RADIO, a framework engineered to yield high-quality dubbed videos regardless of the pose or expression in reference images. The key is to modulate the decoder layers using a latent space composed of audio and reference features. Additionally, we incorporate ViT blocks into the decoder to emphasize high-fidelity details, especially in the lip region. Our experimental results demonstrate that RADIO displays high synchronization without loss of fidelity. Especially in harsh scenarios where the reference frame deviates significantly from the ground truth, our method outperforms state-of-the-art methods, highlighting its robustness.
Submitted 6 November, 2023; v1 submitted 5 September, 2023; originally announced September 2023.
Comments: Accepted by WACV 2024

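The abstract says decoder layers are modulated by a latent built from audio and reference features. One common way to realize such conditioning is FiLM-style per-channel scale and shift; the sketch below assumes that form, which is my choice, not necessarily the paper's.

# FiLM-style modulation of a decoder feature map by an audio+reference
# latent. The FiLM form is an assumption of this sketch.
import torch
import torch.nn as nn

class ModulatedBlock(nn.Module):
    def __init__(self, channels: int, latent_dim: int):
        super().__init__()
        self.to_scale_shift = nn.Linear(latent_dim, 2 * channels)
        self.conv = nn.Conv2d(channels, channels, 3, padding=1)

    def forward(self, feat: torch.Tensor, latent: torch.Tensor) -> torch.Tensor:
        scale, shift = self.to_scale_shift(latent).chunk(2, dim=-1)
        feat = feat * (1 + scale[:, :, None, None]) + shift[:, :, None, None]
        return torch.relu(self.conv(feat))

audio = torch.randn(2, 128)        # audio features (stand-in)
ref = torch.randn(2, 128)          # reference-image features (stand-in)
latent = torch.cat([audio, ref], dim=-1)
block = ModulatedBlock(channels=64, latent_dim=256)
out = block(torch.randn(2, 64, 32, 32), latent)   # (2, 64, 32, 32)
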
arXiv:2309.00349 [pdf] physics.chem-ph cs.LG
Bespoke Nanoparticle Synthesis and Chemical Knowledge Discovery Via Autonomous Experimentations
Authors: Hyuk Jun Yoo, Nayeon Kim, Heeseung Lee, Daeho Kim, Leslie Tiong Ching Ow, Hyobin Nam, Chansoo Kim, Seung Yong Lee, Kwan-Young Lee, Donghun Kim, Sang Soo Han
Abstract: The optimization of nanomaterial synthesis using numerous synthetic variables is an extremely laborious task because conventional combinatorial explorations are prohibitively expensive. In this work, we report an autonomous experimentation platform developed for the bespoke design of nanoparticles (NPs) with targeted optical properties. This platform operates in a closed-loop manner between a batch synthesis module of NPs and a UV-Vis spectroscopy module, based on the feedback of the AI optimization modeling. With silver (Ag) NPs as a representative example, we demonstrate that the Bayesian optimizer implemented with the early stopping criterion can efficiently produce Ag NPs precisely possessing the desired absorption spectra within only 200 iterations (when optimizing among five synthetic reagents). In addition to the outstanding material developmental efficiency, the analysis of synthetic variables further reveals a novel chemistry involving the effects of citrate in Ag NP synthesis. The amount of citrate is key to controlling the competition between spherical and plate-shaped NPs and, as a result, affects the shapes of the absorption spectra as well. Our study highlights both capabilities of the platform: to enhance search efficiency and to provide novel chemical knowledge by analyzing datasets accumulated from the autonomous experimentations.
Submitted 1 September, 2023; originally announced September 2023.

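The closed loop the abstract describes (propose a recipe, synthesize, measure a UV-Vis spectrum, compare to the target, stop early when no progress) can be outlined as below. Everything here is mocked: the synthesize-and-measure step is a toy function, and random proposals stand in for the Bayesian optimizer.

# Skeleton of a closed synthesis/measure/optimize loop with early stopping.
# The measurement and the proposal step are mocks, not the real platform.
import numpy as np

rng = np.random.default_rng(1)
wavelengths = np.linspace(300, 800, 100)
target = np.exp(-((wavelengths - 520) / 40) ** 2)   # desired absorption spectrum

def synthesize_and_measure(recipe: np.ndarray) -> np.ndarray:
    """Mock of batch synthesis followed by UV-Vis measurement."""
    peak = 300 + 500 * recipe[0]
    return np.exp(-((wavelengths - peak) / 40) ** 2) \
        + 0.01 * rng.standard_normal(100)

def spectral_loss(spec: np.ndarray) -> float:
    return float(np.mean((spec - target) ** 2))

best, patience, since_best = np.inf, 20, 0
for it in range(200):
    recipe = rng.random(5)                  # stand-in for the BO proposal
    loss = spectral_loss(synthesize_and_measure(recipe))
    if loss < best - 1e-4:
        best, since_best = loss, 0
    else:
        since_best += 1
    if since_best >= patience:              # early stopping criterion
        print(f"stopped at iteration {it}, best loss {best:.4f}")
        break
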
arXiv:2308.14380 [pdf, other] cs.LG
Self-Supervision for Tackling Unsupervised Anomaly Detection: Pitfalls and Opportunities
Authors: Leman Akoglu, Jaemin Yoo
Abstract: Self-supervised learning (SSL) is a growing torrent that has recently transformed machine learning and its many real-world applications, by learning on massive amounts of unlabeled data via self-generated supervisory signals. Unsupervised anomaly detection (AD) has also capitalized on SSL, by self-generating pseudo-anomalies through various data augmentation functions or external data exposure. In this vision paper, we first underline the importance of the choice of SSL strategies on AD performance, by presenting evidence and studies from the AD literature. Equipped with the understanding that SSL incurs various hyperparameters (HPs) that must be carefully tuned, we present recent developments on unsupervised model selection and augmentation tuning for SSL-based AD. We then highlight emerging challenges and future opportunities: designing new pretext tasks and augmentation functions for different data modalities, creating novel model selection solutions for systematically tuning the SSL HPs, and capitalizing on the potential of pretrained foundation models for AD through effective density estimation.
Submitted 28 August, 2023; originally announced August 2023.

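The general recipe the abstract refers to, generating pseudo-anomalies from normal data via an augmentation function, can be shown in a few lines. The specific augmentation here (CutOut-style patch erasure) is an illustrative choice; the paper surveys many alternatives.

# Generating pseudo-anomalies from normal samples via augmentation
# (the generic SSL-based AD recipe; the augmentation choice is illustrative).
import numpy as np

rng = np.random.default_rng(0)

def cutout(img: np.ndarray, size: int = 8) -> np.ndarray:
    """Erase a random square patch to create a pseudo-anomaly."""
    out = img.copy()
    h, w = img.shape[:2]
    y, x = rng.integers(0, h - size), rng.integers(0, w - size)
    out[y:y + size, x:x + size] = 0.0
    return out

normals = rng.random((100, 32, 32))                 # stand-in "normal" images
pseudo_anomalies = np.stack([cutout(im) for im in normals])
labels = np.concatenate([np.zeros(100), np.ones(100)])
# A detector is then trained to separate normals from pseudo-anomalies.
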
arXiv:2308.11568 [pdf, other] cs.CV
SPANet: Frequency-balancing Token Mixer using Spectral Pooling Aggregation Modulation
Authors: Guhnoo Yun, Juhan Yoo, Kijung Kim, Jeongho Lee, Dong Hwan Kim
Abstract: Recent studies show that self-attentions behave like low-pass filters (as opposed to convolutions) and that enhancing their high-pass filtering capability improves model performance. Contrary to this idea, we investigate existing convolution-based models with spectral analysis and observe that improving the low-pass filtering in convolution operations also leads to performance improvement. To account for this observation, we hypothesize that utilizing optimal token mixers that capture balanced representations of both high- and low-frequency components can enhance the performance of models. We verify this by decomposing visual features into the frequency domain and combining them in a balanced manner. To handle this, we replace the balancing problem with a mask filtering problem in the frequency domain. Then, we introduce a novel token mixer named SPAM and leverage it to derive a MetaFormer model termed SPANet. Experimental results show that the proposed method provides a way to achieve this balance, and that balanced representations of both high- and low-frequency components can improve the performance of models on multiple computer vision tasks. Our code is available at https://doranlyong.github.io/projects/spanet/.
Submitted 22 August, 2023; originally announced August 2023.
Comments: Accepted paper at ICCV 2023

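To make the frequency-balancing idea concrete: a feature map can be split into low- and high-frequency parts with an FFT mask and recombined with a balance weight. The hard radial mask and the scalar mixing weight below are assumptions of this sketch; the paper's SPAM mixer is more involved (see the code link above).

# Splitting a feature map into low/high-frequency parts via an FFT mask
# and recombining them with a balance weight (illustrative sketch).
import torch

def frequency_balance(feat: torch.Tensor, cutoff: float = 0.25,
                      alpha: float = 0.5) -> torch.Tensor:
    """feat: (B, C, H, W); cutoff is a fraction of the Nyquist radius."""
    B, C, H, W = feat.shape
    F = torch.fft.fftshift(torch.fft.fft2(feat), dim=(-2, -1))
    yy, xx = torch.meshgrid(
        torch.linspace(-1, 1, H), torch.linspace(-1, 1, W), indexing="ij")
    low_mask = ((yy ** 2 + xx ** 2).sqrt() <= cutoff).to(feat.dtype)
    low = torch.fft.ifft2(torch.fft.ifftshift(F * low_mask, dim=(-2, -1))).real
    high = feat - low
    return alpha * low + (1 - alpha) * high

out = frequency_balance(torch.randn(2, 16, 32, 32))
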
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.11568v1-abstract-full').style.display = 'none'; document.getElementById('2308.11568v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted paper at ICCV 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2307.11242">arXiv:2307.11242</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2307.11242">pdf</a>, <a href="https://arxiv.org/format/2307.11242">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> On-Sensor Data Filtering using Neuromorphic Computing for High Energy Physics Experiments </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Kulkarni%2C+S+R">Shruti R. Kulkarni</a>, <a href="/search/cs?searchtype=author&amp;query=Young%2C+A">Aaron Young</a>, <a href="/search/cs?searchtype=author&amp;query=Date%2C+P">Prasanna Date</a>, <a href="/search/cs?searchtype=author&amp;query=Miniskar%2C+N+R">Narasinga Rao Miniskar</a>, <a href="/search/cs?searchtype=author&amp;query=Vetter%2C+J+S">Jeffrey S. Vetter</a>, <a href="/search/cs?searchtype=author&amp;query=Fahim%2C+F">Farah Fahim</a>, <a href="/search/cs?searchtype=author&amp;query=Parpillon%2C+B">Benjamin Parpillon</a>, <a href="/search/cs?searchtype=author&amp;query=Dickinson%2C+J">Jennet Dickinson</a>, <a href="/search/cs?searchtype=author&amp;query=Tran%2C+N">Nhan Tran</a>, <a href="/search/cs?searchtype=author&amp;query=Yoo%2C+J">Jieun Yoo</a>, <a href="/search/cs?searchtype=author&amp;query=Mills%2C+C">Corrinne Mills</a>, <a href="/search/cs?searchtype=author&amp;query=Swartz%2C+M">Morris Swartz</a>, <a href="/search/cs?searchtype=author&amp;query=Maksimovic%2C+P">Petar Maksimovic</a>, <a href="/search/cs?searchtype=author&amp;query=Schuman%2C+C+D">Catherine D. Schuman</a>, <a href="/search/cs?searchtype=author&amp;query=Bean%2C+A">Alice Bean</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2307.11242v1-abstract-short" style="display: inline;"> This work describes the investigation of neuromorphic computing-based spiking neural network (SNN) models used to filter data from sensor electronics in high energy physics experiments conducted at the High Luminosity Large Hadron Collider. 
arXiv:2307.08263 [pdf, other] cs.CV
Hierarchical Spatiotemporal Transformers for Video Object Segmentation
Authors: Jun-Sang Yoo, Hongjae Lee, Seung-Won Jung
Abstract: This paper presents a novel framework called HST for semi-supervised video object segmentation (VOS). HST extracts image and video features using the latest Swin Transformer and Video Swin Transformer to inherit their inductive bias for spatiotemporal locality, which is essential for temporally coherent VOS. To take full advantage of the image and video features, HST casts image and video features as a query and memory, respectively. By applying efficient memory read operations at multiple scales, HST produces hierarchical features for the precise reconstruction of object masks. HST shows effectiveness and robustness in handling challenging scenarios with occluded and fast-moving objects under cluttered backgrounds. In particular, HST-B outperforms the state-of-the-art competitors on multiple popular benchmarks: YouTube-VOS (85.0%), DAVIS 2017 (85.9%), and DAVIS 2016 (94.0%).
Submitted 17 July, 2023; originally announced July 2023.

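The query/memory formulation in the abstract amounts to an attention-based memory read: current-frame features query a memory of past-frame features. The single-scale sketch below shows that read; HST applies such reads at multiple scales, and the hierarchy is omitted here.

# Single-scale attention "memory read": image features as query,
# video features as key/value memory (hierarchy omitted).
import torch

def memory_read(query: torch.Tensor, mem_k: torch.Tensor,
                mem_v: torch.Tensor) -> torch.Tensor:
    """query: (B, Nq, D); mem_k, mem_v: (B, Nm, D)."""
    attn = torch.softmax(
        query @ mem_k.transpose(1, 2) / query.shape[-1] ** 0.5, dim=-1)
    return attn @ mem_v                        # (B, Nq, D)

B, D = 1, 64
query = torch.randn(B, 32 * 32, D)             # current-frame (image) features
mem_k = torch.randn(B, 4 * 32 * 32, D)         # features of 4 past frames
mem_v = torch.randn(B, 4 * 32 * 32, D)
read = memory_read(query, mem_k, mem_v)
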
arXiv:2307.06534 [pdf, other] cs.LG
DSV: An Alignment Validation Loss for Self-supervised Outlier Model Selection
Authors: Jaemin Yoo, Yue Zhao, Lingxiao Zhao, Leman Akoglu
Abstract: Self-supervised learning (SSL) has proven effective in solving various problems by generating internal supervisory signals. Unsupervised anomaly detection, which faces the high cost of obtaining true labels, is an area that can greatly benefit from SSL. However, recent literature suggests that tuning the hyperparameters (HPs) of data augmentation functions is crucial to the success of SSL-based anomaly detection (SSAD), yet a systematic method for doing so remains unknown. In this work, we propose DSV (Discordance and Separability Validation), an unsupervised validation loss to select high-performing detection models with effective augmentation HPs. DSV captures the alignment between an augmentation function and the anomaly-generating mechanism with surrogate losses, which approximate the discordance and separability of test data, respectively. As a result, evaluation via DSV leads to selecting an effective SSAD model exhibiting better alignment, which results in high detection accuracy. We theoretically derive the degree of approximation conducted by the surrogate losses and empirically show that DSV outperforms a wide range of baselines on 21 real-world tasks.
Submitted 12 July, 2023; originally announced July 2023.
Comments: Accepted to ECML PKDD 2023

href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>
