Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/>  <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b">  <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\$","\$"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a>  <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div>  <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div>  <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 222 results for author: <span class="mathjax">Jeong, J</span> </h1> </div> <div class="level-right is-hidden-mobile">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Jeong%2C+J">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Jeong, J"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Jeong%2C+J&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Jeong, J"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Jeong%2C+J&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Jeong%2C+J&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Jeong%2C+J&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Jeong%2C+J&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Jeong%2C+J&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Jeong%2C+J&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.12523">arXiv:2502.12523</a> <span> [<a href="https://arxiv.org/pdf/2502.12523">pdf</a>, <a href="https://arxiv.org/format/2502.12523">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> </div> </div> <p class="title is-5 mathjax"> Cohesive Subgraph Discovery in Hypergraphs: A Locality-Driven Indexing Framework </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kim%2C+S">Song Kim</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+D">Dahee Kim</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+J">Junghoon Kim</a>, <a href="/search/cs?searchtype=author&query=Jeong%2C+H+J">Hyun Ji Jeong</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+J">Jungeun Kim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.12523v1-abstract-short" style="display: inline;"> Hypergraphs are increasingly employed to model complex, diverse relationships in modern networks, effectively capturing higher-order interactions. A critical challenge in this domain is the discovery of cohesive subgraphs, which provides valuable insights into hypergraph structures. However, selecting suitable parameters for this task remains unresolved. To address this, we propose an efficient in… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12523v1-abstract-full').style.display = 'inline'; document.getElementById('2502.12523v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.12523v1-abstract-full" style="display: none;"> Hypergraphs are increasingly employed to model complex, diverse relationships in modern networks, effectively capturing higher-order interactions. A critical challenge in this domain is the discovery of cohesive subgraphs, which provides valuable insights into hypergraph structures. However, selecting suitable parameters for this task remains unresolved. To address this, we propose an efficient indexing framework designed for online retrieval of cohesive subgraphs. Our approach enables rapid identification of desired structures without requiring exhaustive graph traversals, thus ensuring scalability and practicality. This framework has broad applicability, supporting informed decision-making across various domains by offering a comprehensive view of network landscapes. Extensive experiments on real-world datasets demonstrate the effectiveness and efficiency of our proposed indexing technique. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12523v1-abstract-full').style.display = 'none'; document.getElementById('2502.12523v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.03740">arXiv:2502.03740</a> <span> [<a href="https://arxiv.org/pdf/2502.03740">pdf</a>, <a href="https://arxiv.org/format/2502.03740">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Multiple Invertible and Partial-Equivariant Function for Latent Vector Transformation to Enhance Disentanglement in VAEs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jung%2C+H">Hee-Jun Jung</a>, <a href="/search/cs?searchtype=author&query=Jeong%2C+J">Jaehyoung Jeong</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+K">Kangil Kim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.03740v1-abstract-short" style="display: inline;"> Disentanglement learning is a core issue for understanding and re-using trained information in Variational AutoEncoder (VAE), and effective inductive bias has been reported as a key factor. However, the actual implementation of such bias is still vague. In this paper, we propose a novel method, called Multiple Invertible and partial-equivariant transformation (MIPE-transformation), to inject induc… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.03740v1-abstract-full').style.display = 'inline'; document.getElementById('2502.03740v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.03740v1-abstract-full" style="display: none;"> Disentanglement learning is a core issue for understanding and re-using trained information in Variational AutoEncoder (VAE), and effective inductive bias has been reported as a key factor. However, the actual implementation of such bias is still vague. In this paper, we propose a novel method, called Multiple Invertible and partial-equivariant transformation (MIPE-transformation), to inject inductive bias by 1) guaranteeing the invertibility of latent-to-latent vector transformation while preserving a certain portion of equivariance of input-to-latent vector transformation, called Invertible and partial-equivariant transformation (IPE-transformation), 2) extending the form of prior and posterior in VAE frameworks to an unrestricted form through a learnable conversion to an approximated exponential family, called Exponential Family conversion (EF-conversion), and 3) integrating multiple units of IPE-transformation and EF-conversion, and their training. In experiments on 3D Cars, 3D Shapes, and dSprites datasets, MIPE-transformation improves the disentanglement performance of state-of-the-art VAEs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.03740v1-abstract-full').style.display = 'none'; document.getElementById('2502.03740v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">24 pages, 21 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.12001">arXiv:2501.12001</a> <span> [<a href="https://arxiv.org/pdf/2501.12001">pdf</a>, <a href="https://arxiv.org/format/2501.12001">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> Conversation Progress Guide : UI System for Enhancing Self-Efficacy in Conversational AI </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jeong%2C+D">Daeun Jeong</a>, <a href="/search/cs?searchtype=author&query=Shin%2C+S">Sungbok Shin</a>, <a href="/search/cs?searchtype=author&query=Jeong%2C+J">Jongwook Jeong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.12001v1-abstract-short" style="display: inline;"> In this study, we introduce the Conversation Progress Guide (CPG), a system designed for text-based conversational AI interactions that provides a visual interface to represent progress. Users often encounter failures when interacting with conversational AI, which can negatively affect their self-efficacy-an individual's belief in their capabilities, reducing their willingness to engage with these… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.12001v1-abstract-full').style.display = 'inline'; document.getElementById('2501.12001v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.12001v1-abstract-full" style="display: none;"> In this study, we introduce the Conversation Progress Guide (CPG), a system designed for text-based conversational AI interactions that provides a visual interface to represent progress. Users often encounter failures when interacting with conversational AI, which can negatively affect their self-efficacy-an individual's belief in their capabilities, reducing their willingness to engage with these services. The CPG offers visual feedback on task progress, providing users with mastery experiences, a key source of self-efficacy. To evaluate the system's effectiveness, we conducted a user study assessing how the integration of the CPG influences user engagement and self-efficacy. Results demonstrate that users interacting with a conversational AI enhanced by the CPG showed significant improvements in self-efficacy measures compared to those using a conventional conversational AI. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.12001v1-abstract-full').style.display = 'none'; document.getElementById('2501.12001v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to ACM CHI2025'</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.10609">arXiv:2501.10609</a> <span> [<a href="https://arxiv.org/pdf/2501.10609">pdf</a>, <a href="https://arxiv.org/format/2501.10609">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> </div> </div> <p class="title is-5 mathjax"> Universal Discrete Filtering with Lookahead or Delay </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yan%2C+P">Pumiao Yan</a>, <a href="/search/cs?searchtype=author&query=Jeong%2C+J">Jiwon Jeong</a>, <a href="/search/cs?searchtype=author&query=Sagan%2C+N">Naomi Sagan</a>, <a href="/search/cs?searchtype=author&query=Weissman%2C+T">Tsachy Weissman</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.10609v1-abstract-short" style="display: inline;"> We consider the universal discrete filtering problem, where an input sequence generated by an unknown source passes through a discrete memoryless channel, and the goal is to estimate its components based on the output sequence with limited lookahead or delay. We propose and establish the universality of a family of schemes for this setting. These schemes are induced by universal Sequential Probabi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.10609v1-abstract-full').style.display = 'inline'; document.getElementById('2501.10609v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.10609v1-abstract-full" style="display: none;"> We consider the universal discrete filtering problem, where an input sequence generated by an unknown source passes through a discrete memoryless channel, and the goal is to estimate its components based on the output sequence with limited lookahead or delay. We propose and establish the universality of a family of schemes for this setting. These schemes are induced by universal Sequential Probability Assignments (SPAs), and inherit their computational properties. We show that the schemes induced by LZ78 are practically implementable and well-suited for scenarios with limited computational resources and latency constraints. In passing, we use some of the intermediate results to obtain upper and lower bounds that appear to be new, in the purely Bayesian setting, on the optimal filtering performance in terms, respectively, of the mutual information between the noise-free and noisy sequence, and the entropy of the noise-free sequence causally conditioned on the noisy one. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.10609v1-abstract-full').style.display = 'none'; document.getElementById('2501.10609v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.03714">arXiv:2501.03714</a> <span> [<a href="https://arxiv.org/pdf/2501.03714">pdf</a>, <a href="https://arxiv.org/format/2501.03714">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MoDec-GS: Global-to-Local Motion Decomposition and Temporal Interval Adjustment for Compact Dynamic 3D Gaussian Splatting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kwak%2C+S">Sangwoon Kwak</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+J">Joonsoo Kim</a>, <a href="/search/cs?searchtype=author&query=Jeong%2C+J+Y">Jun Young Jeong</a>, <a href="/search/cs?searchtype=author&query=Cheong%2C+W">Won-Sik Cheong</a>, <a href="/search/cs?searchtype=author&query=Oh%2C+J">Jihyong Oh</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+M">Munchurl Kim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.03714v1-abstract-short" style="display: inline;"> 3D Gaussian Splatting (3DGS) has made significant strides in scene representation and neural rendering, with intense efforts focused on adapting it for dynamic scenes. Despite delivering remarkable rendering quality and speed, existing methods struggle with storage demands and representing complex real-world motions. To tackle these issues, we propose MoDecGS, a memory-efficient Gaussian splatting… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.03714v1-abstract-full').style.display = 'inline'; document.getElementById('2501.03714v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.03714v1-abstract-full" style="display: none;"> 3D Gaussian Splatting (3DGS) has made significant strides in scene representation and neural rendering, with intense efforts focused on adapting it for dynamic scenes. Despite delivering remarkable rendering quality and speed, existing methods struggle with storage demands and representing complex real-world motions. To tackle these issues, we propose MoDecGS, a memory-efficient Gaussian splatting framework designed for reconstructing novel views in challenging scenarios with complex motions. We introduce GlobaltoLocal Motion Decomposition (GLMD) to effectively capture dynamic motions in a coarsetofine manner. This approach leverages Global Canonical Scaffolds (Global CS) and Local Canonical Scaffolds (Local CS), extending static Scaffold representation to dynamic video reconstruction. For Global CS, we propose Global Anchor Deformation (GAD) to efficiently represent global dynamics along complex motions, by directly deforming the implicit Scaffold attributes which are anchor position, offset, and local context features. Next, we finely adjust local motions via the Local Gaussian Deformation (LGD) of Local CS explicitly. Additionally, we introduce Temporal Interval Adjustment (TIA) to automatically control the temporal coverage of each Local CS during training, allowing MoDecGS to find optimal interval assignments based on the specified number of temporal segments. Extensive evaluations demonstrate that MoDecGS achieves an average 70% reduction in model size over stateoftheart methods for dynamic 3D Gaussians from realworld dynamic videos while maintaining or even improving rendering quality. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.03714v1-abstract-full').style.display = 'none'; document.getElementById('2501.03714v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The last two authors are co-corresponding authors. Please visit our project page at https://kaist-viclab.github.io/MoDecGS-site/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.12629">arXiv:2412.12629</a> <span> [<a href="https://arxiv.org/pdf/2412.12629">pdf</a>, <a href="https://arxiv.org/format/2412.12629">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> a2z-1 for Multi-Disease Detection in Abdomen-Pelvis CT: External Validation and Performance Analysis Across 21 Conditions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Rajpurkar%2C+P">Pranav Rajpurkar</a>, <a href="/search/cs?searchtype=author&query=Acosta%2C+J+N">Julian N. Acosta</a>, <a href="/search/cs?searchtype=author&query=Dogra%2C+S">Siddhant Dogra</a>, <a href="/search/cs?searchtype=author&query=Jeong%2C+J">Jaehwan Jeong</a>, <a href="/search/cs?searchtype=author&query=Jindal%2C+D">Deepanshu Jindal</a>, <a href="/search/cs?searchtype=author&query=Moritz%2C+M">Michael Moritz</a>, <a href="/search/cs?searchtype=author&query=Rajpurkar%2C+S">Samir Rajpurkar</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.12629v1-abstract-short" style="display: inline;"> We present a comprehensive evaluation of a2z-1, an artificial intelligence (AI) model designed to analyze abdomen-pelvis CT scans for 21 time-sensitive and actionable findings. Our study focuses on rigorous assessment of the model's performance and generalizability. Large-scale retrospective analysis demonstrates an average AUC of 0.931 across 21 conditions. External validation across two distinct… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.12629v1-abstract-full').style.display = 'inline'; document.getElementById('2412.12629v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.12629v1-abstract-full" style="display: none;"> We present a comprehensive evaluation of a2z-1, an artificial intelligence (AI) model designed to analyze abdomen-pelvis CT scans for 21 time-sensitive and actionable findings. Our study focuses on rigorous assessment of the model's performance and generalizability. Large-scale retrospective analysis demonstrates an average AUC of 0.931 across 21 conditions. External validation across two distinct health systems confirms consistent performance (AUC 0.923), establishing generalizability to different evaluation scenarios, with notable performance in critical findings such as small bowel obstruction (AUC 0.958) and acute pancreatitis (AUC 0.961). Subgroup analysis shows consistent accuracy across patient sex, age groups, and varied imaging protocols, including different slice thicknesses and contrast administration types. Comparison of high-confidence model outputs to radiologist reports reveals instances where a2z-1 identified overlooked findings, suggesting potential for quality assurance applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.12629v1-abstract-full').style.display = 'none'; document.getElementById('2412.12629v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.11463">arXiv:2412.11463</a> <span> [<a href="https://arxiv.org/pdf/2412.11463">pdf</a>, <a href="https://arxiv.org/format/2412.11463">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> FedCAR: Cross-client Adaptive Re-weighting for Generative Models in Federated Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kim%2C+M">Minjun Kim</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+M">Minjee Kim</a>, <a href="/search/cs?searchtype=author&query=Jeong%2C+J">Jinhoon Jeong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.11463v1-abstract-short" style="display: inline;"> Generative models trained on multi-institutional datasets can provide an enriched understanding through diverse data distributions. However, training the models on medical images is often challenging due to hospitals' reluctance to share data for privacy reasons. Federated learning(FL) has emerged as a privacy-preserving solution for training distributed datasets across data centers by aggregating… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.11463v1-abstract-full').style.display = 'inline'; document.getElementById('2412.11463v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.11463v1-abstract-full" style="display: none;"> Generative models trained on multi-institutional datasets can provide an enriched understanding through diverse data distributions. However, training the models on medical images is often challenging due to hospitals' reluctance to share data for privacy reasons. Federated learning(FL) has emerged as a privacy-preserving solution for training distributed datasets across data centers by aggregating model weights from multiple clients instead of sharing raw data. Previous research has explored the adaptation of FL to generative models, yet effective aggregation algorithms specifically tailored for generative models remain unexplored. We hereby propose a novel algorithm aimed at improving the performance of generative models within FL. Our approach adaptively re-weights the contribution of each client, resulting in well-trained shared parameters. In each round, the server side measures the distribution distance between fake images generated by clients instead of directly comparing the Fr茅chet Inception Distance per client, thereby enhancing efficiency of the learning. Experimental results on three public chest X-ray datasets show superior performance in medical image generation, outperforming both centralized learning and conventional FL algorithms. Our code is available at https://github.com/danny0628/FedCAR. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.11463v1-abstract-full').style.display = 'none'; document.getElementById('2412.11463v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.09921">arXiv:2412.09921</a> <span> [<a href="https://arxiv.org/pdf/2412.09921">pdf</a>, <a href="https://arxiv.org/format/2412.09921">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> FaceShield: Defending Facial Image against Deepfake Threats </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jeong%2C+J">Jaehwan Jeong</a>, <a href="/search/cs?searchtype=author&query=In%2C+S">Sumin In</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+S">Sieun Kim</a>, <a href="/search/cs?searchtype=author&query=Shin%2C+H">Hannie Shin</a>, <a href="/search/cs?searchtype=author&query=Jeong%2C+J">Jongheon Jeong</a>, <a href="/search/cs?searchtype=author&query=Yoon%2C+S+H">Sang Ho Yoon</a>, <a href="/search/cs?searchtype=author&query=Chung%2C+J">Jaewook Chung</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+S">Sangpil Kim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.09921v1-abstract-short" style="display: inline;"> The rising use of deepfakes in criminal activities presents a significant issue, inciting widespread controversy. While numerous studies have tackled this problem, most primarily focus on deepfake detection. These reactive solutions are insufficient as a fundamental approach for crimes where authenticity verification is not critical. Existing proactive defenses also have limitations, as they are e… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.09921v1-abstract-full').style.display = 'inline'; document.getElementById('2412.09921v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.09921v1-abstract-full" style="display: none;"> The rising use of deepfakes in criminal activities presents a significant issue, inciting widespread controversy. While numerous studies have tackled this problem, most primarily focus on deepfake detection. These reactive solutions are insufficient as a fundamental approach for crimes where authenticity verification is not critical. Existing proactive defenses also have limitations, as they are effective only for deepfake models based on specific Generative Adversarial Networks (GANs), making them less applicable in light of recent advancements in diffusion-based models. In this paper, we propose a proactive defense method named FaceShield, which introduces novel attack strategies targeting deepfakes generated by Diffusion Models (DMs) and facilitates attacks on various existing GAN-based deepfake models through facial feature extractor manipulations. Our approach consists of three main components: (i) manipulating the attention mechanism of DMs to exclude protected facial features during the denoising process, (ii) targeting prominent facial feature extraction models to enhance the robustness of our adversarial perturbation, and (iii) employing Gaussian blur and low-pass filtering techniques to improve imperceptibility while enhancing robustness against JPEG distortion. Experimental results on the CelebA-HQ and VGGFace2-HQ datasets demonstrate that our method achieves state-of-the-art performance against the latest deepfake models based on DMs, while also exhibiting applicability to GANs and showcasing greater imperceptibility of noise along with enhanced robustness. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.09921v1-abstract-full').style.display = 'none'; document.getElementById('2412.09921v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.07333">arXiv:2412.07333</a> <span> [<a href="https://arxiv.org/pdf/2412.07333">pdf</a>, <a href="https://arxiv.org/format/2412.07333">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Fusion Embedding for Pose-Guided Person Image Synthesis with Diffusion Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lee%2C+D">Donghwna Lee</a>, <a href="/search/cs?searchtype=author&query=Min%2C+K">Kyungha Min</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+K">Kirok Kim</a>, <a href="/search/cs?searchtype=author&query=Jeong%2C+S">Seyoung Jeong</a>, <a href="/search/cs?searchtype=author&query=Jeong%2C+J">Jiwoo Jeong</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+W">Wooju Kim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.07333v1-abstract-short" style="display: inline;"> Pose-Guided Person Image Synthesis (PGPIS) aims to synthesize high-quality person images corresponding to target poses while preserving the appearance of the source image. Recently, PGPIS methods that use diffusion models have achieved competitive performance. Most approaches involve extracting representations of the target pose and source image and learning their relationships in the generative m… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.07333v1-abstract-full').style.display = 'inline'; document.getElementById('2412.07333v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.07333v1-abstract-full" style="display: none;"> Pose-Guided Person Image Synthesis (PGPIS) aims to synthesize high-quality person images corresponding to target poses while preserving the appearance of the source image. Recently, PGPIS methods that use diffusion models have achieved competitive performance. Most approaches involve extracting representations of the target pose and source image and learning their relationships in the generative model's training process. This approach makes it difficult to learn the semantic relationships between the input and target images and complicates the model structure needed to enhance generation results. To address these issues, we propose Fusion embedding for PGPIS using a Diffusion Model (FPDM). Inspired by the successful application of pre-trained CLIP models in text-to-image diffusion models, our method consists of two stages. The first stage involves training the fusion embedding of the source image and target pose to align with the target image's embedding. In the second stage, the generative model uses this fusion embedding as a condition to generate the target image. We applied the proposed method to the benchmark datasets DeepFashion and RWTH-PHOENIX-Weather 2014T, and conducted both quantitative and qualitative evaluations, demonstrating state-of-the-art (SOTA) performance. An ablation study of the model structure showed that even a model using only the second stage achieved performance close to the other PGPIS SOTA models. The code is available at https://github.com/dhlee-work/FPDM. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.07333v1-abstract-full').style.display = 'none'; document.getElementById('2412.07333v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.03162">arXiv:2412.03162</a> <span> [<a href="https://arxiv.org/pdf/2412.03162">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> </div> </div> <p class="title is-5 mathjax"> LLM-Mirror: A Generated-Persona Approach for Survey Pre-Testing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kim%2C+S">Sunwoong Kim</a>, <a href="/search/cs?searchtype=author&query=Jeong%2C+J">Jongho Jeong</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J+S">Jin Soo Han</a>, <a href="/search/cs?searchtype=author&query=Shin%2C+D">Donghyuk Shin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.03162v2-abstract-short" style="display: inline;"> Surveys are widely used in social sciences to understand human behavior, but their implementation often involves iterative adjustments that demand significant effort and resources. To this end, researchers have increasingly turned to large language models (LLMs) to simulate human behavior. While existing studies have focused on distributional similarities, individual-level comparisons remain under… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.03162v2-abstract-full').style.display = 'inline'; document.getElementById('2412.03162v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.03162v2-abstract-full" style="display: none;"> Surveys are widely used in social sciences to understand human behavior, but their implementation often involves iterative adjustments that demand significant effort and resources. To this end, researchers have increasingly turned to large language models (LLMs) to simulate human behavior. While existing studies have focused on distributional similarities, individual-level comparisons remain underexplored. Building upon prior work, we investigate whether providing LLMs with respondents' prior information can replicate both statistical distributions and individual decision-making patterns using Partial Least Squares Structural Equation Modeling (PLS-SEM), a well-established causal analysis method. We also introduce the concept of the LLM-Mirror, user personas generated by supplying respondent-specific information to the LLM. By comparing responses generated by the LLM-Mirror with actual individual survey responses, we assess its effectiveness in replicating individual-level outcomes. Our findings show that: (1) PLS-SEM analysis shows LLM-generated responses align with human responses, (2) LLMs, when provided with respondent-specific information, are capable of reproducing individual human responses, and (3) LLM-Mirror responses closely follow human responses at the individual level. These findings highlight the potential of LLMs as a complementary tool for pre-testing surveys and optimizing research design. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.03162v2-abstract-full').style.display = 'none'; document.getElementById('2412.03162v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.08933">arXiv:2411.08933</a> <span> [<a href="https://arxiv.org/pdf/2411.08933">pdf</a>, <a href="https://arxiv.org/format/2411.08933">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Confidence-aware Denoised Fine-tuning of Off-the-shelf Models for Certified Robustness </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jang%2C+S">Suhyeok Jang</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+S">Seojin Kim</a>, <a href="/search/cs?searchtype=author&query=Shin%2C+J">Jinwoo Shin</a>, <a href="/search/cs?searchtype=author&query=Jeong%2C+J">Jongheon Jeong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.08933v2-abstract-short" style="display: inline;"> The remarkable advances in deep learning have led to the emergence of many off-the-shelf classifiers, e.g., large pre-trained models. However, since they are typically trained on clean data, they remain vulnerable to adversarial attacks. Despite this vulnerability, their superior performance and transferability make off-the-shelf classifiers still valuable in practice, demanding further work to pr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08933v2-abstract-full').style.display = 'inline'; document.getElementById('2411.08933v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.08933v2-abstract-full" style="display: none;"> The remarkable advances in deep learning have led to the emergence of many off-the-shelf classifiers, e.g., large pre-trained models. However, since they are typically trained on clean data, they remain vulnerable to adversarial attacks. Despite this vulnerability, their superior performance and transferability make off-the-shelf classifiers still valuable in practice, demanding further work to provide adversarial robustness for them in a post-hoc manner. A recently proposed method, denoised smoothing, leverages a denoiser model in front of the classifier to obtain provable robustness without additional training. However, the denoiser often creates hallucination, i.e., images that have lost the semantics of their originally assigned class, leading to a drop in robustness. Furthermore, its noise-and-denoise procedure introduces a significant distribution shift from the original distribution, causing the denoised smoothing framework to achieve sub-optimal robustness. In this paper, we introduce Fine-Tuning with Confidence-Aware Denoised Image Selection (FT-CADIS), a novel fine-tuning scheme to enhance the certified robustness of off-the-shelf classifiers. FT-CADIS is inspired by the observation that the confidence of off-the-shelf classifiers can effectively identify hallucinated images during denoised smoothing. Based on this, we develop a confidence-aware training objective to handle such hallucinated images and improve the stability of fine-tuning from denoised images. In this way, the classifier can be fine-tuned using only images that are beneficial for adversarial robustness. We also find that such a fine-tuning can be done by updating a small fraction of parameters of the classifier. Extensive experiments demonstrate that FT-CADIS has established the state-of-the-art certified robustness among denoised smoothing methods across all $\ell_2$-adversary radius in various benchmarks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08933v2-abstract-full').style.display = 'none'; document.getElementById('2411.08933v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">26 pages; TMLR 2024; Code is available at https://github.com/suhyeok24/FT-CADIS</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01281">arXiv:2411.01281</a> <span> [<a href="https://arxiv.org/pdf/2411.01281">pdf</a>, <a href="https://arxiv.org/format/2411.01281">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Varco Arena: A Tournament Approach to Reference-Free Benchmarking Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Son%2C+S">Seonil Son</a>, <a href="/search/cs?searchtype=author&query=Oh%2C+J">Ju-Min Oh</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+H">Heegon Jin</a>, <a href="/search/cs?searchtype=author&query=Jang%2C+C">Cheolhun Jang</a>, <a href="/search/cs?searchtype=author&query=Jeong%2C+J">Jeongbeom Jeong</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+K">Kuntae Kim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01281v3-abstract-short" style="display: inline;"> Most existing benchmarking approaches for evaluating the output quality of large language models (LLMs) rely on comparing LLM responses to predefined references. Such methods, based on static datasets, quickly become outdated as LLM capabilities and use cases evolve. In this work, we introduce VARCO Arena--a novel, cost-effective, and robust benchmarking approach that leverages a single-eliminatio… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01281v3-abstract-full').style.display = 'inline'; document.getElementById('2411.01281v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01281v3-abstract-full" style="display: none;"> Most existing benchmarking approaches for evaluating the output quality of large language models (LLMs) rely on comparing LLM responses to predefined references. Such methods, based on static datasets, quickly become outdated as LLM capabilities and use cases evolve. In this work, we introduce VARCO Arena--a novel, cost-effective, and robust benchmarking approach that leverages a single-elimination tournament structure to minimize the number of required comparisons while eliminating the need for static references or costly human annotations. We validate our approach through two experiments: (i) a simulation study that examines its robustness under various conditions, and (ii) an empirical evaluation using publicly available benchmark prompts. In both experiments, VARCO Arena consistently outperforms current LLM benchmarking practices, achieving stronger correlations with human-established Elo ratings. Our results demonstrate that VARCO Arena not only produces reliable LLM rankings but also provides a scalable, adaptable solution for qualitative evaluation across diverse, customized use cases. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01281v3-abstract-full').style.display = 'none'; document.getElementById('2411.01281v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages for main body, 17 pages in total</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00626">arXiv:2411.00626</a> <span> [<a href="https://arxiv.org/pdf/2411.00626">pdf</a>, <a href="https://arxiv.org/format/2411.00626">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ZIM: Zero-Shot Image Matting for Anything </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kim%2C+B">Beomyoung Kim</a>, <a href="/search/cs?searchtype=author&query=Shin%2C+C">Chanyong Shin</a>, <a href="/search/cs?searchtype=author&query=Jeong%2C+J">Joonhyun Jeong</a>, <a href="/search/cs?searchtype=author&query=Jung%2C+H">Hyungsik Jung</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+S">Se-Yun Lee</a>, <a href="/search/cs?searchtype=author&query=Chun%2C+S">Sewhan Chun</a>, <a href="/search/cs?searchtype=author&query=Hwang%2C+D">Dong-Hyun Hwang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+J">Joonsang Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00626v1-abstract-short" style="display: inline;"> The recent segmentation foundation model, Segment Anything Model (SAM), exhibits strong zero-shot segmentation capabilities, but it falls short in generating fine-grained precise masks. To address this limitation, we propose a novel zero-shot image matting model, called ZIM, with two key contributions: First, we develop a label converter that transforms segmentation labels into detailed matte labe… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00626v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00626v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00626v1-abstract-full" style="display: none;"> The recent segmentation foundation model, Segment Anything Model (SAM), exhibits strong zero-shot segmentation capabilities, but it falls short in generating fine-grained precise masks. To address this limitation, we propose a novel zero-shot image matting model, called ZIM, with two key contributions: First, we develop a label converter that transforms segmentation labels into detailed matte labels, constructing the new SA1B-Matte dataset without costly manual annotations. Training SAM with this dataset enables it to generate precise matte masks while maintaining its zero-shot capability. Second, we design the zero-shot matting model equipped with a hierarchical pixel decoder to enhance mask representation, along with a prompt-aware masked attention mechanism to improve performance by enabling the model to focus on regions specified by visual prompts. We evaluate ZIM using the newly introduced MicroMat-3K test set, which contains high-quality micro-level matte labels. Experimental results show that ZIM outperforms existing methods in fine-grained mask generation and zero-shot generalization. Furthermore, we demonstrate the versatility of ZIM in various downstream tasks requiring precise masks, such as image inpainting and 3D NeRF. Our contributions provide a robust foundation for advancing zero-shot matting and its downstream applications across a wide range of computer vision tasks. The code is available at \url{https://github.com/naver-ai/ZIM}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00626v1-abstract-full').style.display = 'none'; document.getElementById('2411.00626v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">preprint (21 pages, 16 figures, and 8 tables)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.07832">arXiv:2410.07832</a> <span> [<a href="https://arxiv.org/pdf/2410.07832">pdf</a>, <a href="https://arxiv.org/format/2410.07832">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> LaB-CL: Localized and Balanced Contrastive Learning for improving parking slot detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jeong%2C+U+J">U Jin Jeong</a>, <a href="/search/cs?searchtype=author&query=Roh%2C+S">Sumin Roh</a>, <a href="/search/cs?searchtype=author&query=Chun%2C+I+Y">Il Yong Chun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.07832v1-abstract-short" style="display: inline;"> Parking slot detection is an essential technology in autonomous parking systems. In general, the classification problem of parking slot detection consists of two tasks, a task determining whether localized candidates are junctions of parking slots or not, and the other that identifies a shape of detected junctions. Both classification tasks can easily face biased learning toward the majority class… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.07832v1-abstract-full').style.display = 'inline'; document.getElementById('2410.07832v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.07832v1-abstract-full" style="display: none;"> Parking slot detection is an essential technology in autonomous parking systems. In general, the classification problem of parking slot detection consists of two tasks, a task determining whether localized candidates are junctions of parking slots or not, and the other that identifies a shape of detected junctions. Both classification tasks can easily face biased learning toward the majority class, degrading classification performances. Yet, the data imbalance issue has been overlooked in parking slot detection. We propose the first supervised contrastive learning framework for parking slot detection, Localized and Balanced Contrastive Learning for improving parking slot detection (LaB-CL). The proposed LaB-CL framework uses two main approaches. First, we propose to include class prototypes to consider representations from all classes in every mini batch, from the local perspective. Second, we propose a new hard negative sampling scheme that selects local representations with high prediction error. Experiments with the benchmark dataset demonstrate that the proposed LaB-CL framework can outperform existing parking slot detection methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.07832v1-abstract-full').style.display = 'none'; document.getElementById('2410.07832v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7 pages, 6 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.06940">arXiv:2410.06940</a> <span> [<a href="https://arxiv.org/pdf/2410.06940">pdf</a>, <a href="https://arxiv.org/format/2410.06940">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Representation Alignment for Generation: Training Diffusion Transformers Is Easier Than You Think </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yu%2C+S">Sihyun Yu</a>, <a href="/search/cs?searchtype=author&query=Kwak%2C+S">Sangkyung Kwak</a>, <a href="/search/cs?searchtype=author&query=Jang%2C+H">Huiwon Jang</a>, <a href="/search/cs?searchtype=author&query=Jeong%2C+J">Jongheon Jeong</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+J">Jonathan Huang</a>, <a href="/search/cs?searchtype=author&query=Shin%2C+J">Jinwoo Shin</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+S">Saining Xie</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.06940v2-abstract-short" style="display: inline;"> Recent studies have shown that the denoising process in (generative) diffusion models can induce meaningful (discriminative) representations inside the model, though the quality of these representations still lags behind those learned through recent self-supervised learning methods. We argue that one main bottleneck in training large-scale diffusion models for generation lies in effectively learni… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06940v2-abstract-full').style.display = 'inline'; document.getElementById('2410.06940v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.06940v2-abstract-full" style="display: none;"> Recent studies have shown that the denoising process in (generative) diffusion models can induce meaningful (discriminative) representations inside the model, though the quality of these representations still lags behind those learned through recent self-supervised learning methods. We argue that one main bottleneck in training large-scale diffusion models for generation lies in effectively learning these representations. Moreover, training can be made easier by incorporating high-quality external visual representations, rather than relying solely on the diffusion models to learn them independently. We study this by introducing a straightforward regularization called REPresentation Alignment (REPA), which aligns the projections of noisy input hidden states in denoising networks with clean image representations obtained from external, pretrained visual encoders. The results are striking: our simple strategy yields significant improvements in both training efficiency and generation quality when applied to popular diffusion and flow-based transformers, such as DiTs and SiTs. For instance, our method can speed up SiT training by over 17.5$\times$, matching the performance (without classifier-free guidance) of a SiT-XL model trained for 7M steps in less than 400K steps. In terms of final generation quality, our approach achieves state-of-the-art results of FID=1.42 using classifier-free guidance with the guidance interval. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06940v2-abstract-full').style.display = 'none'; document.getElementById('2410.06940v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Preprint. Project page: https://sihyun.me/REPA</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.05694">arXiv:2410.05694</a> <span> [<a href="https://arxiv.org/pdf/2410.05694">pdf</a>, <a href="https://arxiv.org/format/2410.05694">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> DiffusionGuard: A Robust Defense Against Malicious Diffusion-based Image Editing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Choi%2C+J+S">June Suk Choi</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+K">Kyungmin Lee</a>, <a href="/search/cs?searchtype=author&query=Jeong%2C+J">Jongheon Jeong</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+S">Saining Xie</a>, <a href="/search/cs?searchtype=author&query=Shin%2C+J">Jinwoo Shin</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+K">Kimin Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.05694v1-abstract-short" style="display: inline;"> Recent advances in diffusion models have introduced a new era of text-guided image manipulation, enabling users to create realistic edited images with simple textual prompts. However, there is significant concern about the potential misuse of these methods, especially in creating misleading or harmful content. Although recent defense strategies, which introduce imperceptible adversarial noise to i… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05694v1-abstract-full').style.display = 'inline'; document.getElementById('2410.05694v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.05694v1-abstract-full" style="display: none;"> Recent advances in diffusion models have introduced a new era of text-guided image manipulation, enabling users to create realistic edited images with simple textual prompts. However, there is significant concern about the potential misuse of these methods, especially in creating misleading or harmful content. Although recent defense strategies, which introduce imperceptible adversarial noise to induce model failure, have shown promise, they remain ineffective against more sophisticated manipulations, such as editing with a mask. In this work, we propose DiffusionGuard, a robust and effective defense method against unauthorized edits by diffusion-based image editing models, even in challenging setups. Through a detailed analysis of these models, we introduce a novel objective that generates adversarial noise targeting the early stage of the diffusion process. This approach significantly improves the efficiency and effectiveness of adversarial noises. We also introduce a mask-augmentation technique to enhance robustness against various masks during test time. Finally, we introduce a comprehensive benchmark designed to evaluate the effectiveness and robustness of methods in protecting against privacy threats in realistic scenarios. Through extensive experiments, we show that our method achieves stronger protection and improved mask robustness with lower computational costs compared to the strongest baseline. Additionally, our method exhibits superior transferability and better resilience to noise removal techniques compared to all baseline methods. Our source code is publicly available at https://github.com/choi403/DiffusionGuard. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05694v1-abstract-full').style.display = 'none'; document.getElementById('2410.05694v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Preprint. Under review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.09883">arXiv:2409.09883</a> <span> [<a href="https://arxiv.org/pdf/2409.09883">pdf</a>, <a href="https://arxiv.org/format/2409.09883">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Robots that Suggest Safe Alternatives </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jeong%2C+H+J">Hyun Joe Jeong</a>, <a href="/search/cs?searchtype=author&query=Bajcsy%2C+A">Andrea Bajcsy</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.09883v1-abstract-short" style="display: inline;"> Goal-conditioned policies, such as those learned via imitation learning, provide an easy way for humans to influence what tasks robots accomplish. However, these robot policies are not guaranteed to execute safely or to succeed when faced with out-of-distribution requests. In this work, we enable robots to know when they can confidently execute a user's desired goal, and automatically suggest safe… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.09883v1-abstract-full').style.display = 'inline'; document.getElementById('2409.09883v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.09883v1-abstract-full" style="display: none;"> Goal-conditioned policies, such as those learned via imitation learning, provide an easy way for humans to influence what tasks robots accomplish. However, these robot policies are not guaranteed to execute safely or to succeed when faced with out-of-distribution requests. In this work, we enable robots to know when they can confidently execute a user's desired goal, and automatically suggest safe alternatives when they cannot. Our approach is inspired by control-theoretic safety filtering, wherein a safety filter minimally adjusts a robot's candidate action to be safe. Our key idea is to pose alternative suggestion as a safe control problem in goal space, rather than in action space. Offline, we use reachability analysis to compute a goal-parameterized reach-avoid value network which quantifies the safety and liveness of the robot's pre-trained policy. Online, our robot uses the reach-avoid value network as a safety filter, monitoring the human's given goal and actively suggesting alternatives that are similar but meet the safety specification. We demonstrate our Safe ALTernatives (SALT) framework in simulation experiments with indoor navigation and Franka Panda tabletop manipulation, and with both discrete and continuous goal representations. We find that SALT is able to learn to predict successful and failed closed-loop executions, is a less pessimistic monitor than open-loop uncertainty quantification, and proposes alternatives that consistently align with those people find acceptable. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.09883v1-abstract-full').style.display = 'none'; document.getElementById('2409.09883v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 5 figures, 2 tables, submitted to ICRA 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.05662">arXiv:2409.05662</a> <span> [<a href="https://arxiv.org/pdf/2409.05662">pdf</a>, <a href="https://arxiv.org/format/2409.05662">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Real-Time Human Action Recognition on Embedded Platforms </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+R">Ruiqi Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zichen Wang</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+P">Peiqi Gao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+M">Mingzhen Li</a>, <a href="/search/cs?searchtype=author&query=Jeong%2C+J">Jaehwan Jeong</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Y">Yihang Xu</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+Y">Yejin Lee</a>, <a href="/search/cs?searchtype=author&query=Baum%2C+C+M">Carolyn M. Baum</a>, <a href="/search/cs?searchtype=author&query=Connor%2C+L+T">Lisa Tabor Connor</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+C">Chenyang Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.05662v2-abstract-short" style="display: inline;"> With advancements in computer vision and deep learning, video-based human action recognition (HAR) has become practical. However, due to the complexity of the computation pipeline, running HAR on live video streams incurs excessive delays on embedded platforms. This work tackles the real-time performance challenges of HAR with four contributions: 1) an experimental study identifying a standard Opt… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.05662v2-abstract-full').style.display = 'inline'; document.getElementById('2409.05662v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.05662v2-abstract-full" style="display: none;"> With advancements in computer vision and deep learning, video-based human action recognition (HAR) has become practical. However, due to the complexity of the computation pipeline, running HAR on live video streams incurs excessive delays on embedded platforms. This work tackles the real-time performance challenges of HAR with four contributions: 1) an experimental study identifying a standard Optical Flow (OF) extraction technique as the latency bottleneck in a state-of-the-art HAR pipeline, 2) an exploration of the latency-accuracy tradeoff between the standard and deep learning approaches to OF extraction, which highlights the need for a novel, efficient motion feature extractor, 3) the design of Integrated Motion Feature Extractor (IMFE), a novel single-shot neural network architecture for motion feature extraction with drastic improvement in latency, 4) the development of RT-HARE, a real-time HAR system tailored for embedded platforms. Experimental results on an Nvidia Jetson Xavier NX platform demonstrated that RT-HARE realizes real-time HAR at a video frame rate of 30 frames per second while delivering high levels of recognition accuracy. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.05662v2-abstract-full').style.display = 'none'; document.getElementById('2409.05662v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.14916">arXiv:2408.14916</a> <span> [<a href="https://arxiv.org/pdf/2408.14916">pdf</a>, <a href="https://arxiv.org/format/2408.14916">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Towards Real-world Event-guided Low-light Video Enhancement and Deblurring </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kim%2C+T">Taewoo Kim</a>, <a href="/search/cs?searchtype=author&query=Jeong%2C+J">Jaeseok Jeong</a>, <a href="/search/cs?searchtype=author&query=Cho%2C+H">Hoonhee Cho</a>, <a href="/search/cs?searchtype=author&query=Jeong%2C+Y">Yuhwan Jeong</a>, <a href="/search/cs?searchtype=author&query=Yoon%2C+K">Kuk-Jin Yoon</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.14916v1-abstract-short" style="display: inline;"> In low-light conditions, capturing videos with frame-based cameras often requires long exposure times, resulting in motion blur and reduced visibility. While frame-based motion deblurring and low-light enhancement have been studied, they still pose significant challenges. Event cameras have emerged as a promising solution for improving image quality in low-light environments and addressing motion… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.14916v1-abstract-full').style.display = 'inline'; document.getElementById('2408.14916v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.14916v1-abstract-full" style="display: none;"> In low-light conditions, capturing videos with frame-based cameras often requires long exposure times, resulting in motion blur and reduced visibility. While frame-based motion deblurring and low-light enhancement have been studied, they still pose significant challenges. Event cameras have emerged as a promising solution for improving image quality in low-light environments and addressing motion blur. They provide two key advantages: capturing scene details well even in low light due to their high dynamic range, and effectively capturing motion information during long exposures due to their high temporal resolution. Despite efforts to tackle low-light enhancement and motion deblurring using event cameras separately, previous work has not addressed both simultaneously. To explore the joint task, we first establish real-world datasets for event-guided low-light enhancement and deblurring using a hybrid camera system based on beam splitters. Subsequently, we introduce an end-to-end framework to effectively handle these tasks. Our framework incorporates a module to efficiently leverage temporal information from events and frames. Furthermore, we propose a module to utilize cross-modal feature information to employ a low-pass filter for noise suppression while enhancing the main structural information. Our proposed method significantly outperforms existing approaches in addressing the joint task. Our project pages are available at https://github.com/intelpro/ELEDNet. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.14916v1-abstract-full').style.display = 'none'; document.getElementById('2408.14916v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted in ECCV2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.21448">arXiv:2407.21448</a> <span> [<a href="https://arxiv.org/pdf/2407.21448">pdf</a>, <a href="https://arxiv.org/format/2407.21448">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Accelerating Image Super-Resolution Networks with Pixel-Level Classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jeong%2C+J">Jinho Jeong</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+J">Jinwoo Kim</a>, <a href="/search/cs?searchtype=author&query=Jo%2C+Y">Younghyun Jo</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+S+J">Seon Joo Kim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.21448v1-abstract-short" style="display: inline;"> In recent times, the need for effective super-resolution (SR) techniques has surged, especially for large-scale images ranging 2K to 8K resolutions. For DNN-based SISR, decomposing images into overlapping patches is typically necessary due to computational constraints. In such patch-decomposing scheme, one can allocate computational resources differently based on each patch's difficulty to further… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.21448v1-abstract-full').style.display = 'inline'; document.getElementById('2407.21448v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.21448v1-abstract-full" style="display: none;"> In recent times, the need for effective super-resolution (SR) techniques has surged, especially for large-scale images ranging 2K to 8K resolutions. For DNN-based SISR, decomposing images into overlapping patches is typically necessary due to computational constraints. In such patch-decomposing scheme, one can allocate computational resources differently based on each patch's difficulty to further improve efficiency while maintaining SR performance. However, this approach has a limitation: computational resources is uniformly allocated within a patch, leading to lower efficiency when the patch contain pixels with varying levels of restoration difficulty. To address the issue, we propose the Pixel-level Classifier for Single Image Super-Resolution (PCSR), a novel method designed to distribute computational resources adaptively at the pixel level. A PCSR model comprises a backbone, a pixel-level classifier, and a set of pixel-level upsamplers with varying capacities. The pixel-level classifier assigns each pixel to an appropriate upsampler based on its restoration difficulty, thereby optimizing computational resource usage. Our method allows for performance and computational cost balance during inference without re-training. Our experiments demonstrate PCSR's advantage over existing patch-distributing methods in PSNR-FLOP trade-offs across different backbone models and benchmarks. The code is available at https://github.com/3587jjh/PCSR. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.21448v1-abstract-full').style.display = 'none'; document.getElementById('2407.21448v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ECCV 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.20657">arXiv:2407.20657</a> <span> [<a href="https://arxiv.org/pdf/2407.20657">pdf</a>, <a href="https://arxiv.org/format/2407.20657">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Prompt-Driven Contrastive Learning for Transferable Adversarial Attacks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+H">Hunmin Yang</a>, <a href="/search/cs?searchtype=author&query=Jeong%2C+J">Jongoh Jeong</a>, <a href="/search/cs?searchtype=author&query=Yoon%2C+K">Kuk-Jin Yoon</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.20657v1-abstract-short" style="display: inline;"> Recent vision-language foundation models, such as CLIP, have demonstrated superior capabilities in learning representations that can be transferable across diverse range of downstream tasks and domains. With the emergence of such powerful models, it has become crucial to effectively leverage their capabilities in tackling challenging vision tasks. On the other hand, only a few works have focused o… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.20657v1-abstract-full').style.display = 'inline'; document.getElementById('2407.20657v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.20657v1-abstract-full" style="display: none;"> Recent vision-language foundation models, such as CLIP, have demonstrated superior capabilities in learning representations that can be transferable across diverse range of downstream tasks and domains. With the emergence of such powerful models, it has become crucial to effectively leverage their capabilities in tackling challenging vision tasks. On the other hand, only a few works have focused on devising adversarial examples that transfer well to both unknown domains and model architectures. In this paper, we propose a novel transfer attack method called PDCL-Attack, which leverages the CLIP model to enhance the transferability of adversarial perturbations generated by a generative model-based attack framework. Specifically, we formulate an effective prompt-driven feature guidance by harnessing the semantic representation power of text, particularly from the ground-truth class labels of input images. To the best of our knowledge, we are the first to introduce prompt learning to enhance the transferable generative attacks. Extensive experiments conducted across various cross-domain and cross-model settings empirically validate our approach, demonstrating its superiority over state-of-the-art methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.20657v1-abstract-full').style.display = 'none'; document.getElementById('2407.20657v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to ECCV 2024, Project Page: https://PDCL-Attack.github.io</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.20653">arXiv:2407.20653</a> <span> [<a href="https://arxiv.org/pdf/2407.20653">pdf</a>, <a href="https://arxiv.org/format/2407.20653">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> FACL-Attack: Frequency-Aware Contrastive Learning for Transferable Adversarial Attacks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+H">Hunmin Yang</a>, <a href="/search/cs?searchtype=author&query=Jeong%2C+J">Jongoh Jeong</a>, <a href="/search/cs?searchtype=author&query=Yoon%2C+K">Kuk-Jin Yoon</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.20653v1-abstract-short" style="display: inline;"> Deep neural networks are known to be vulnerable to security risks due to the inherent transferable nature of adversarial examples. Despite the success of recent generative model-based attacks demonstrating strong transferability, it still remains a challenge to design an efficient attack strategy in a real-world strict black-box setting, where both the target domain and model architectures are unk… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.20653v1-abstract-full').style.display = 'inline'; document.getElementById('2407.20653v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.20653v1-abstract-full" style="display: none;"> Deep neural networks are known to be vulnerable to security risks due to the inherent transferable nature of adversarial examples. Despite the success of recent generative model-based attacks demonstrating strong transferability, it still remains a challenge to design an efficient attack strategy in a real-world strict black-box setting, where both the target domain and model architectures are unknown. In this paper, we seek to explore a feature contrastive approach in the frequency domain to generate adversarial examples that are robust in both cross-domain and cross-model settings. With that goal in mind, we propose two modules that are only employed during the training phase: a Frequency-Aware Domain Randomization (FADR) module to randomize domain-variant low- and high-range frequency components and a Frequency-Augmented Contrastive Learning (FACL) module to effectively separate domain-invariant mid-frequency features of clean and perturbed image. We demonstrate strong transferability of our generated adversarial perturbations through extensive cross-domain and cross-model experiments, while keeping the inference time complexity. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.20653v1-abstract-full').style.display = 'none'; document.getElementById('2407.20653v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to AAAI 2024, Project Page: https://FACL-Attack.github.io</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.18658">arXiv:2407.18658</a> <span> [<a href="https://arxiv.org/pdf/2407.18658">pdf</a>, <a href="https://arxiv.org/format/2407.18658">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Adversarial Robustification via Text-to-Image Diffusion Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Choi%2C+D">Daewon Choi</a>, <a href="/search/cs?searchtype=author&query=Jeong%2C+J">Jongheon Jeong</a>, <a href="/search/cs?searchtype=author&query=Jang%2C+H">Huiwon Jang</a>, <a href="/search/cs?searchtype=author&query=Shin%2C+J">Jinwoo Shin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.18658v1-abstract-short" style="display: inline;"> Adversarial robustness has been conventionally believed as a challenging property to encode for neural networks, requiring plenty of training data. In the recent paradigm of adopting off-the-shelf models, however, access to their training data is often infeasible or not practical, while most of such models are not originally trained concerning adversarial robustness. In this paper, we develop a sc… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.18658v1-abstract-full').style.display = 'inline'; document.getElementById('2407.18658v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.18658v1-abstract-full" style="display: none;"> Adversarial robustness has been conventionally believed as a challenging property to encode for neural networks, requiring plenty of training data. In the recent paradigm of adopting off-the-shelf models, however, access to their training data is often infeasible or not practical, while most of such models are not originally trained concerning adversarial robustness. In this paper, we develop a scalable and model-agnostic solution to achieve adversarial robustness without using any data. Our intuition is to view recent text-to-image diffusion models as "adaptable" denoisers that can be optimized to specify target tasks. Based on this, we propose: (a) to initiate a denoise-and-classify pipeline that offers provable guarantees against adversarial attacks, and (b) to leverage a few synthetic reference images generated from the text-to-image model that enables novel adaptation schemes. Our experiments show that our data-free scheme applied to the pre-trained CLIP could improve the (provable) adversarial robustness of its diverse zero-shot classification derivatives (while maintaining their accuracy), significantly surpassing prior approaches that utilize the full training data. Not only for CLIP, we also demonstrate that our framework is easily applicable for robustifying other visual classifiers efficiently. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.18658v1-abstract-full').style.display = 'none'; document.getElementById('2407.18658v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Code is available at https://github.com/ChoiDae1/robustify-T2I</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.04218">arXiv:2407.04218</a> <span> [<a href="https://arxiv.org/pdf/2407.04218">pdf</a>, <a href="https://arxiv.org/format/2407.04218">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Batch Transformer: Look for Attention in Batch </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Her%2C+M+B">Myung Beom Her</a>, <a href="/search/cs?searchtype=author&query=Jeong%2C+J">Jisu Jeong</a>, <a href="/search/cs?searchtype=author&query=Song%2C+H">Hojoon Song</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J">Ji-Hyeong Han</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.04218v1-abstract-short" style="display: inline;"> Facial expression recognition (FER) has received considerable attention in computer vision, with "in-the-wild" environments such as human-computer interaction. However, FER images contain uncertainties such as occlusion, low resolution, pose variation, illumination variation, and subjectivity, which includes some expressions that do not match the target label. Consequently, little information is o… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.04218v1-abstract-full').style.display = 'inline'; document.getElementById('2407.04218v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.04218v1-abstract-full" style="display: none;"> Facial expression recognition (FER) has received considerable attention in computer vision, with "in-the-wild" environments such as human-computer interaction. However, FER images contain uncertainties such as occlusion, low resolution, pose variation, illumination variation, and subjectivity, which includes some expressions that do not match the target label. Consequently, little information is obtained from a noisy single image and it is not trusted. This could significantly degrade the performance of the FER task. To address this issue, we propose a batch transformer (BT), which consists of the proposed class batch attention (CBA) module, to prevent overfitting in noisy data and extract trustworthy information by training on features reflected from several images in a batch, rather than information from a single image. We also propose multi-level attention (MLA) to prevent overfitting the specific features by capturing correlations between each level. In this paper, we present a batch transformer network (BTN) that combines the above proposals. Experimental results on various FER benchmark datasets show that the proposed BTN consistently outperforms the state-ofthe-art in FER datasets. Representative results demonstrate the promise of the proposed BTN for FER. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.04218v1-abstract-full').style.display = 'none'; document.getElementById('2407.04218v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.06424">arXiv:2406.06424</a> <span> [<a href="https://arxiv.org/pdf/2406.06424">pdf</a>, <a href="https://arxiv.org/format/2406.06424">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Margin-aware Preference Optimization for Aligning Diffusion Models without Reference </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hong%2C+J">Jiwoo Hong</a>, <a href="/search/cs?searchtype=author&query=Paul%2C+S">Sayak Paul</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+N">Noah Lee</a>, <a href="/search/cs?searchtype=author&query=Rasul%2C+K">Kashif Rasul</a>, <a href="/search/cs?searchtype=author&query=Thorne%2C+J">James Thorne</a>, <a href="/search/cs?searchtype=author&query=Jeong%2C+J">Jongheon Jeong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.06424v1-abstract-short" style="display: inline;"> Modern alignment techniques based on human preferences, such as RLHF and DPO, typically employ divergence regularization relative to the reference model to ensure training stability. However, this often limits the flexibility of models during alignment, especially when there is a clear distributional discrepancy between the preference data and the reference model. In this paper, we focus on the al… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.06424v1-abstract-full').style.display = 'inline'; document.getElementById('2406.06424v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.06424v1-abstract-full" style="display: none;"> Modern alignment techniques based on human preferences, such as RLHF and DPO, typically employ divergence regularization relative to the reference model to ensure training stability. However, this often limits the flexibility of models during alignment, especially when there is a clear distributional discrepancy between the preference data and the reference model. In this paper, we focus on the alignment of recent text-to-image diffusion models, such as Stable Diffusion XL (SDXL), and find that this "reference mismatch" is indeed a significant problem in aligning these models due to the unstructured nature of visual modalities: e.g., a preference for a particular stylistic aspect can easily induce such a discrepancy. Motivated by this observation, we propose a novel and memory-friendly preference alignment method for diffusion models that does not depend on any reference model, coined margin-aware preference optimization (MaPO). MaPO jointly maximizes the likelihood margin between the preferred and dispreferred image sets and the likelihood of the preferred sets, simultaneously learning general stylistic features and preferences. For evaluation, we introduce two new pairwise preference datasets, which comprise self-generated image pairs from SDXL, Pick-Style and Pick-Safety, simulating diverse scenarios of reference mismatch. Our experiments validate that MaPO can significantly improve alignment on Pick-Style and Pick-Safety and general preference alignment when used with Pick-a-Pic v2, surpassing the base SDXL and other existing methods. Our code, models, and datasets are publicly available via https://mapo-t2i.github.io <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.06424v1-abstract-full').style.display = 'none'; document.getElementById('2406.06424v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Preprint</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.19675">arXiv:2405.19675</a> <span> [<a href="https://arxiv.org/pdf/2405.19675">pdf</a>, <a href="https://arxiv.org/format/2405.19675">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Knowledge-grounded Adaptation Strategy for Vision-language Models: Building Unique Case-set for Screening Mammograms for Residents Training </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Khan%2C+A+U">Aisha Urooj Khan</a>, <a href="/search/cs?searchtype=author&query=Garrett%2C+J">John Garrett</a>, <a href="/search/cs?searchtype=author&query=Bradshaw%2C+T">Tyler Bradshaw</a>, <a href="/search/cs?searchtype=author&query=Salkowski%2C+L">Lonie Salkowski</a>, <a href="/search/cs?searchtype=author&query=Jeong%2C+J+J">Jiwoong Jason Jeong</a>, <a href="/search/cs?searchtype=author&query=Tariq%2C+A">Amara Tariq</a>, <a href="/search/cs?searchtype=author&query=Banerjee%2C+I">Imon Banerjee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.19675v1-abstract-short" style="display: inline;"> A visual-language model (VLM) pre-trained on natural images and text pairs poses a significant barrier when applied to medical contexts due to domain shift. Yet, adapting or fine-tuning these VLMs for medical use presents considerable hurdles, including domain misalignment, limited access to extensive datasets, and high-class imbalances. Hence, there is a pressing need for strategies to effectivel… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.19675v1-abstract-full').style.display = 'inline'; document.getElementById('2405.19675v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.19675v1-abstract-full" style="display: none;"> A visual-language model (VLM) pre-trained on natural images and text pairs poses a significant barrier when applied to medical contexts due to domain shift. Yet, adapting or fine-tuning these VLMs for medical use presents considerable hurdles, including domain misalignment, limited access to extensive datasets, and high-class imbalances. Hence, there is a pressing need for strategies to effectively adapt these VLMs to the medical domain, as such adaptations would prove immensely valuable in healthcare applications. In this study, we propose a framework designed to adeptly tailor VLMs to the medical domain, employing selective sampling and hard-negative mining techniques for enhanced performance in retrieval tasks. We validate the efficacy of our proposed approach by implementing it across two distinct VLMs: the in-domain VLM (MedCLIP) and out-of-domain VLMs (ALBEF). We assess the performance of these models both in their original off-the-shelf state and after undergoing our proposed training strategies, using two extensive datasets containing mammograms and their corresponding reports. Our evaluation spans zero-shot, few-shot, and supervised scenarios. Through our approach, we observe a notable enhancement in Recall@K performance for the image-text retrieval task. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.19675v1-abstract-full').style.display = 'none'; document.getElementById('2405.19675v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.16784">arXiv:2405.16784</a> <span> [<a href="https://arxiv.org/pdf/2405.16784">pdf</a>, <a href="https://arxiv.org/ps/2405.16784">ps</a>, <a href="https://arxiv.org/format/2405.16784">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> The second-order zero differential uniformity of the swapped inverse functions over finite fields </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jeong%2C+J">Jaeseong Jeong</a>, <a href="/search/cs?searchtype=author&query=Koo%2C+N">Namhun Koo</a>, <a href="/search/cs?searchtype=author&query=Kwon%2C+S">Soonhak Kwon</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.16784v2-abstract-short" style="display: inline;"> The Feistel Boomerang Connectivity Table (FBCT) was proposed as the feistel counterpart of the Boomerang Connectivity Table. The entries of the FBCT are actually related to the second-order zero differential spectrum. Recently, several results on the second-order zero differential uniformity of some functions were introduced. However, almost all of them were focused on power functions, and there a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.16784v2-abstract-full').style.display = 'inline'; document.getElementById('2405.16784v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.16784v2-abstract-full" style="display: none;"> The Feistel Boomerang Connectivity Table (FBCT) was proposed as the feistel counterpart of the Boomerang Connectivity Table. The entries of the FBCT are actually related to the second-order zero differential spectrum. Recently, several results on the second-order zero differential uniformity of some functions were introduced. However, almost all of them were focused on power functions, and there are only few results on non-power functions. In this paper, we investigate the second-order zero differential uniformity of the swapped inverse functions, which are functions obtained from swapping two points in the inverse function. We also present the second-order zero differential spectrum of the swapped inverse functions for certain cases. In particular, this paper is the first result to characterize classes of non-power functions with the second-order zero differential uniformity equal to 4, in even characteristic. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.16784v2-abstract-full').style.display = 'none'; document.getElementById('2405.16784v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.14024">arXiv:2405.14024</a> <span> [<a href="https://arxiv.org/pdf/2405.14024">pdf</a>, <a href="https://arxiv.org/format/2405.14024">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Two Heads are Better Than One: Neural Networks Quantization with 2D Hilbert Curve-based Output Representation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Uss%2C+M">Mykhailo Uss</a>, <a href="/search/cs?searchtype=author&query=Yermolenko%2C+R">Ruslan Yermolenko</a>, <a href="/search/cs?searchtype=author&query=Kolodiazhna%2C+O">Olena Kolodiazhna</a>, <a href="/search/cs?searchtype=author&query=Shashko%2C+O">Oleksii Shashko</a>, <a href="/search/cs?searchtype=author&query=Safonov%2C+I">Ivan Safonov</a>, <a href="/search/cs?searchtype=author&query=Savin%2C+V">Volodymyr Savin</a>, <a href="/search/cs?searchtype=author&query=Yeo%2C+Y">Yoonjae Yeo</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+S">Seowon Ji</a>, <a href="/search/cs?searchtype=author&query=Jeong%2C+J">Jaeyun Jeong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.14024v1-abstract-short" style="display: inline;"> Quantization is widely used to increase deep neural networks' (DNN) memory, computation, and power efficiency. Various techniques, such as post-training quantization and quantization-aware training, have been proposed to improve quantization quality. We introduce a novel approach for DNN quantization that uses a redundant representation of DNN's output. We represent the target quantity as a point… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.14024v1-abstract-full').style.display = 'inline'; document.getElementById('2405.14024v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.14024v1-abstract-full" style="display: none;"> Quantization is widely used to increase deep neural networks' (DNN) memory, computation, and power efficiency. Various techniques, such as post-training quantization and quantization-aware training, have been proposed to improve quantization quality. We introduce a novel approach for DNN quantization that uses a redundant representation of DNN's output. We represent the target quantity as a point on a 2D parametric curve. The DNN model is modified to predict 2D points that are mapped back to the target quantity at a post-processing stage. We demonstrate that this mapping can reduce quantization error. For the low-order parametric Hilbert curve, Depth-From-Stereo task, and two models represented by U-Net architecture and vision transformer, we achieved a quantization error reduction by about 5 times for the INT8 model at both CPU and DSP delegates. This gain comes with a minimal inference time increase (less than 7%). Our approach can be applied to other tasks, including segmentation, object detection, and key-points prediction. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.14024v1-abstract-full').style.display = 'none'; document.getElementById('2405.14024v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">18 pages, 10 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.10536">arXiv:2405.10536</a> <span> [<a href="https://arxiv.org/pdf/2405.10536">pdf</a>, <a href="https://arxiv.org/format/2405.10536">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Time-Varying Constraint-Aware Reinforcement Learning for Energy Storage Control </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jeong%2C+J">Jaeik Jeong</a>, <a href="/search/cs?searchtype=author&query=Ku%2C+T">Tai-Yeon Ku</a>, <a href="/search/cs?searchtype=author&query=Park%2C+W">Wan-Ki Park</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.10536v1-abstract-short" style="display: inline;"> Energy storage devices, such as batteries, thermal energy storages, and hydrogen systems, can help mitigate climate change by ensuring a more stable and sustainable power supply. To maximize the effectiveness of such energy storage, determining the appropriate charging and discharging amounts for each time period is crucial. Reinforcement learning is preferred over traditional optimization for the… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.10536v1-abstract-full').style.display = 'inline'; document.getElementById('2405.10536v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.10536v1-abstract-full" style="display: none;"> Energy storage devices, such as batteries, thermal energy storages, and hydrogen systems, can help mitigate climate change by ensuring a more stable and sustainable power supply. To maximize the effectiveness of such energy storage, determining the appropriate charging and discharging amounts for each time period is crucial. Reinforcement learning is preferred over traditional optimization for the control of energy storage due to its ability to adapt to dynamic and complex environments. However, the continuous nature of charging and discharging levels in energy storage poses limitations for discrete reinforcement learning, and time-varying feasible charge-discharge range based on state of charge (SoC) variability also limits the conventional continuous reinforcement learning. In this paper, we propose a continuous reinforcement learning approach that takes into account the time-varying feasible charge-discharge range. An additional objective function was introduced for learning the feasible action range for each time period, supplementing the objectives of training the actor for policy learning and the critic for value learning. This actively promotes the utilization of energy storage by preventing them from getting stuck in suboptimal states, such as continuous full charging or discharging. This is achieved through the enforcement of the charging and discharging levels into the feasible action range. The experimental results demonstrated that the proposed method further maximized the effectiveness of energy storage by actively enhancing its utilization. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.10536v1-abstract-full').style.display = 'none'; document.getElementById('2405.10536v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICLR 2024 Workshop: Tackling Climate Change with Machine Learning</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.19007">arXiv:2404.19007</a> <span> [<a href="https://arxiv.org/pdf/2404.19007">pdf</a>, <a href="https://arxiv.org/format/2404.19007">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> </div> </div> <p class="title is-5 mathjax"> How Did We Get Here? Summarizing Conversation Dynamics </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hua%2C+Y">Yilun Hua</a>, <a href="/search/cs?searchtype=author&query=Chernogor%2C+N">Nicholas Chernogor</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+Y">Yuzhe Gu</a>, <a href="/search/cs?searchtype=author&query=Jeong%2C+S+J">Seoyeon Julie Jeong</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+M">Miranda Luo</a>, <a href="/search/cs?searchtype=author&query=Danescu-Niculescu-Mizil%2C+C">Cristian Danescu-Niculescu-Mizil</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.19007v1-abstract-short" style="display: inline;"> Throughout a conversation, the way participants interact with each other is in constant flux: their tones may change, they may resort to different strategies to convey their points, or they might alter their interaction patterns. An understanding of these dynamics can complement that of the actual facts and opinions discussed, offering a more holistic view of the trajectory of the conversation: ho… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.19007v1-abstract-full').style.display = 'inline'; document.getElementById('2404.19007v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.19007v1-abstract-full" style="display: none;"> Throughout a conversation, the way participants interact with each other is in constant flux: their tones may change, they may resort to different strategies to convey their points, or they might alter their interaction patterns. An understanding of these dynamics can complement that of the actual facts and opinions discussed, offering a more holistic view of the trajectory of the conversation: how it arrived at its current state and where it is likely heading. In this work, we introduce the task of summarizing the dynamics of conversations, by constructing a dataset of human-written summaries, and exploring several automated baselines. We evaluate whether such summaries can capture the trajectory of conversations via an established downstream task: forecasting whether an ongoing conversation will eventually derail into toxic behavior. We show that they help both humans and automated systems with this forecasting task. Humans make predictions three times faster, and with greater confidence, when reading the summaries than when reading the transcripts. Furthermore, automated forecasting systems are more accurate when constructing, and then predicting based on, summaries of conversation dynamics, compared to directly predicting on the transcripts. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.19007v1-abstract-full').style.display = 'none'; document.getElementById('2404.19007v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To appear in the Proceedings of NAACL 2024. Data available in ConvoKit https://convokit.cornell.edu/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.16484">arXiv:2404.16484</a> <span> [<a href="https://arxiv.org/pdf/2404.16484">pdf</a>, <a href="https://arxiv.org/format/2404.16484">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Real-Time 4K Super-Resolution of Compressed AVIF Images. AIS 2024 Challenge Survey </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Conde%2C+M+V">Marcos V. Conde</a>, <a href="/search/cs?searchtype=author&query=Lei%2C+Z">Zhijun Lei</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wen Li</a>, <a href="/search/cs?searchtype=author&query=Stejerean%2C+C">Cosmin Stejerean</a>, <a href="/search/cs?searchtype=author&query=Katsavounidis%2C+I">Ioannis Katsavounidis</a>, <a href="/search/cs?searchtype=author&query=Timofte%2C+R">Radu Timofte</a>, <a href="/search/cs?searchtype=author&query=Yoon%2C+K">Kihwan Yoon</a>, <a href="/search/cs?searchtype=author&query=Gankhuyag%2C+G">Ganzorig Gankhuyag</a>, <a href="/search/cs?searchtype=author&query=Lv%2C+J">Jiangtao Lv</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+L">Long Sun</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+J">Jinshan Pan</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+J">Jiangxin Dong</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+J">Jinhui Tang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhiyuan Li</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+H">Hao Wei</a>, <a href="/search/cs?searchtype=author&query=Ge%2C+C">Chenyang Ge</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+D">Dongyang Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+T">Tianle Liu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Huaian Chen</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+Y">Yi Jin</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+M">Menghan Zhou</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+Y">Yiqiang Yan</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+S">Si Gao</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+B">Biao Wu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+S">Shaoli Liu</a> , et al. (50 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.16484v1-abstract-short" style="display: inline;"> This paper introduces a novel benchmark as part of the AIS 2024 Real-Time Image Super-Resolution (RTSR) Challenge, which aims to upscale compressed images from 540p to 4K resolution (4x factor) in real-time on commercial GPUs. For this, we use a diverse test set containing a variety of 4K images ranging from digital art to gaming and photography. The images are compressed using the modern AVIF cod… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.16484v1-abstract-full').style.display = 'inline'; document.getElementById('2404.16484v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.16484v1-abstract-full" style="display: none;"> This paper introduces a novel benchmark as part of the AIS 2024 Real-Time Image Super-Resolution (RTSR) Challenge, which aims to upscale compressed images from 540p to 4K resolution (4x factor) in real-time on commercial GPUs. For this, we use a diverse test set containing a variety of 4K images ranging from digital art to gaming and photography. The images are compressed using the modern AVIF codec, instead of JPEG. All the proposed methods improve PSNR fidelity over Lanczos interpolation, and process images under 10ms. Out of the 160 participants, 25 teams submitted their code and models. The solutions present novel designs tailored for memory-efficiency and runtime on edge devices. This survey describes the best solutions for real-time SR of compressed high-resolution images. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.16484v1-abstract-full').style.display = 'none'; document.getElementById('2404.16484v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">CVPR 2024, AI for Streaming (AIS) Workshop</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.15707">arXiv:2404.15707</a> <span> [<a href="https://arxiv.org/pdf/2404.15707">pdf</a>, <a href="https://arxiv.org/format/2404.15707">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ESR-NeRF: Emissive Source Reconstruction Using LDR Multi-view Images </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jeong%2C+J">Jinseo Jeong</a>, <a href="/search/cs?searchtype=author&query=Koo%2C+J">Junseo Koo</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Q">Qimeng Zhang</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+G">Gunhee Kim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.15707v2-abstract-short" style="display: inline;"> Existing NeRF-based inverse rendering methods suppose that scenes are exclusively illuminated by distant light sources, neglecting the potential influence of emissive sources within a scene. In this work, we confront this limitation using LDR multi-view images captured with emissive sources turned on and off. Two key issues must be addressed: 1) ambiguity arising from the limited dynamic range alo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.15707v2-abstract-full').style.display = 'inline'; document.getElementById('2404.15707v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.15707v2-abstract-full" style="display: none;"> Existing NeRF-based inverse rendering methods suppose that scenes are exclusively illuminated by distant light sources, neglecting the potential influence of emissive sources within a scene. In this work, we confront this limitation using LDR multi-view images captured with emissive sources turned on and off. Two key issues must be addressed: 1) ambiguity arising from the limited dynamic range along with unknown lighting details, and 2) the expensive computational cost in volume rendering to backtrace the paths leading to final object colors. We present a novel approach, ESR-NeRF, leveraging neural networks as learnable functions to represent ray-traced fields. By training networks to satisfy light transport segments, we regulate outgoing radiances, progressively identifying emissive sources while being aware of reflection areas. The results on scenes encompassing emissive sources with various properties demonstrate the superiority of ESR-NeRF in qualitative and quantitative ways. Our approach also extends its applicability to the scenes devoid of emissive sources, achieving lower CD metrics on the DTU dataset. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.15707v2-abstract-full').style.display = 'none'; document.getElementById('2404.15707v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">CVPR 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.08135">arXiv:2404.08135</a> <span> [<a href="https://arxiv.org/pdf/2404.08135">pdf</a>, <a href="https://arxiv.org/format/2404.08135">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> SciFlow: Empowering Lightweight Optical Flow Models with Self-Cleaning Iterations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lin%2C+J+M">Jamie Menjay Lin</a>, <a href="/search/cs?searchtype=author&query=Jeong%2C+J">Jisoo Jeong</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+H">Hong Cai</a>, <a href="/search/cs?searchtype=author&query=Garrepalli%2C+R">Risheek Garrepalli</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+K">Kai Wang</a>, <a href="/search/cs?searchtype=author&query=Porikli%2C+F">Fatih Porikli</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.08135v1-abstract-short" style="display: inline;"> Optical flow estimation is crucial to a variety of vision tasks. Despite substantial recent advancements, achieving real-time on-device optical flow estimation remains a complex challenge. First, an optical flow model must be sufficiently lightweight to meet computation and memory constraints to ensure real-time performance on devices. Second, the necessity for real-time on-device operation impose… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.08135v1-abstract-full').style.display = 'inline'; document.getElementById('2404.08135v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.08135v1-abstract-full" style="display: none;"> Optical flow estimation is crucial to a variety of vision tasks. Despite substantial recent advancements, achieving real-time on-device optical flow estimation remains a complex challenge. First, an optical flow model must be sufficiently lightweight to meet computation and memory constraints to ensure real-time performance on devices. Second, the necessity for real-time on-device operation imposes constraints that weaken the model's capacity to adequately handle ambiguities in flow estimation, thereby intensifying the difficulty of preserving flow accuracy. This paper introduces two synergistic techniques, Self-Cleaning Iteration (SCI) and Regression Focal Loss (RFL), designed to enhance the capabilities of optical flow models, with a focus on addressing optical flow regression ambiguities. These techniques prove particularly effective in mitigating error propagation, a prevalent issue in optical flow models that employ iterative refinement. Notably, these techniques add negligible to zero overhead in model parameters and inference latency, thereby preserving real-time on-device efficiency. The effectiveness of our proposed SCI and RFL techniques, collectively referred to as SciFlow for brevity, is demonstrated across two distinct lightweight optical flow model architectures in our experiments. Remarkably, SciFlow enables substantial reduction in error metrics (EPE and Fl-all) over the baseline models by up to 6.3% and 10.5% for in-domain scenarios and by up to 6.2% and 13.5% for cross-domain scenarios on the Sintel and KITTI 2015 datasets, respectively. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.08135v1-abstract-full').style.display = 'none'; document.getElementById('2404.08135v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">CVPRW 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.07431">arXiv:2404.07431</a> <span> [<a href="https://arxiv.org/pdf/2404.07431">pdf</a>, <a href="https://arxiv.org/format/2404.07431">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Parameterized Fast and Safe Tracking (FaSTrack) using Deepreach </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jeong%2C+H+J">Hyun Joe Jeong</a>, <a href="/search/cs?searchtype=author&query=Gong%2C+Z">Zheng Gong</a>, <a href="/search/cs?searchtype=author&query=Bansal%2C+S">Somil Bansal</a>, <a href="/search/cs?searchtype=author&query=Herbert%2C+S">Sylvia Herbert</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.07431v1-abstract-short" style="display: inline;"> Fast and Safe Tracking (FaSTrack) is a modular framework that provides safety guarantees while planning and executing trajectories in real time via value functions of Hamilton-Jacobi (HJ) reachability. These value functions are computed through dynamic programming, which is notorious for being computationally inefficient. Moreover, the resulting trajectory does not adapt online to the environment,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.07431v1-abstract-full').style.display = 'inline'; document.getElementById('2404.07431v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.07431v1-abstract-full" style="display: none;"> Fast and Safe Tracking (FaSTrack) is a modular framework that provides safety guarantees while planning and executing trajectories in real time via value functions of Hamilton-Jacobi (HJ) reachability. These value functions are computed through dynamic programming, which is notorious for being computationally inefficient. Moreover, the resulting trajectory does not adapt online to the environment, such as sudden disturbances or obstacles. DeepReach is a scalable deep learning method to HJ reachability that allows parameterization of states, which opens up possibilities for online adaptation to various controls and disturbances. In this paper, we propose Parametric FaSTrack, which uses DeepReach to approximate a value function that parameterizes the control bounds of the planning model. The new framework can smoothly trade off between the navigation speed and the tracking error (therefore maneuverability) while guaranteeing obstacle avoidance in a priori unknown environments. We demonstrate our method through two examples and a benchmark comparison with existing methods, showing the safety, efficiency, and faster solution times of the framework. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.07431v1-abstract-full').style.display = 'none'; document.getElementById('2404.07431v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 6 figures, 1 table, to be published in L4DC</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.05218">arXiv:2404.05218</a> <span> [<a href="https://arxiv.org/pdf/2404.05218">pdf</a>, <a href="https://arxiv.org/format/2404.05218">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Multi-agent Long-term 3D Human Pose Forecasting via Interaction-aware Trajectory Conditioning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jeong%2C+J">Jaewoo Jeong</a>, <a href="/search/cs?searchtype=author&query=Park%2C+D">Daehee Park</a>, <a href="/search/cs?searchtype=author&query=Yoon%2C+K">Kuk-Jin Yoon</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.05218v1-abstract-short" style="display: inline;"> Human pose forecasting garners attention for its diverse applications. However, challenges in modeling the multi-modal nature of human motion and intricate interactions among agents persist, particularly with longer timescales and more agents. In this paper, we propose an interaction-aware trajectory-conditioned long-term multi-agent human pose forecasting model, utilizing a coarse-to-fine predict… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.05218v1-abstract-full').style.display = 'inline'; document.getElementById('2404.05218v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.05218v1-abstract-full" style="display: none;"> Human pose forecasting garners attention for its diverse applications. However, challenges in modeling the multi-modal nature of human motion and intricate interactions among agents persist, particularly with longer timescales and more agents. In this paper, we propose an interaction-aware trajectory-conditioned long-term multi-agent human pose forecasting model, utilizing a coarse-to-fine prediction approach: multi-modal global trajectories are initially forecasted, followed by respective local pose forecasts conditioned on each mode. In doing so, our Trajectory2Pose model introduces a graph-based agent-wise interaction module for a reciprocal forecast of local motion-conditioned global trajectory and trajectory-conditioned local pose. Our model effectively handles the multi-modality of human motion and the complexity of long-term multi-agent interactions, improving performance in complex environments. Furthermore, we address the lack of long-term (6s+) multi-agent (5+) datasets by constructing a new dataset from real-world images and 2D annotations, enabling a comprehensive evaluation of our proposed model. State-of-the-art prediction performance on both complex and simpler datasets confirms the generalized effectiveness of our method. The code is available at https://github.com/Jaewoo97/T2P. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.05218v1-abstract-full').style.display = 'none'; document.getElementById('2404.05218v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">2024 CVPR Highlight</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.01954">arXiv:2404.01954</a> <span> [<a href="https://arxiv.org/pdf/2404.01954">pdf</a>, <a href="https://arxiv.org/format/2404.01954">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> HyperCLOVA X Technical Report </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yoo%2C+K+M">Kang Min Yoo</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J">Jaegeun Han</a>, <a href="/search/cs?searchtype=author&query=In%2C+S">Sookyo In</a>, <a href="/search/cs?searchtype=author&query=Jeon%2C+H">Heewon Jeon</a>, <a href="/search/cs?searchtype=author&query=Jeong%2C+J">Jisu Jeong</a>, <a href="/search/cs?searchtype=author&query=Kang%2C+J">Jaewook Kang</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+H">Hyunwook Kim</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+K">Kyung-Min Kim</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+M">Munhyong Kim</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+S">Sungju Kim</a>, <a href="/search/cs?searchtype=author&query=Kwak%2C+D">Donghyun Kwak</a>, <a href="/search/cs?searchtype=author&query=Kwak%2C+H">Hanock Kwak</a>, <a href="/search/cs?searchtype=author&query=Kwon%2C+S+J">Se Jung Kwon</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+B">Bado Lee</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+D">Dongsoo Lee</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+G">Gichang Lee</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+J">Jooho Lee</a>, <a href="/search/cs?searchtype=author&query=Park%2C+B">Baeseong Park</a>, <a href="/search/cs?searchtype=author&query=Shin%2C+S">Seongjin Shin</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+J">Joonsang Yu</a>, <a href="/search/cs?searchtype=author&query=Baek%2C+S">Seolki Baek</a>, <a href="/search/cs?searchtype=author&query=Byeon%2C+S">Sumin Byeon</a>, <a href="/search/cs?searchtype=author&query=Cho%2C+E">Eungsup Cho</a>, <a href="/search/cs?searchtype=author&query=Choe%2C+D">Dooseok Choe</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J">Jeesung Han</a> , et al. (371 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.01954v2-abstract-short" style="display: inline;"> We introduce HyperCLOVA X, a family of large language models (LLMs) tailored to the Korean language and culture, along with competitive capabilities in English, math, and coding. HyperCLOVA X was trained on a balanced mix of Korean, English, and code data, followed by instruction-tuning with high-quality human-annotated datasets while abiding by strict safety guidelines reflecting our commitment t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.01954v2-abstract-full').style.display = 'inline'; document.getElementById('2404.01954v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.01954v2-abstract-full" style="display: none;"> We introduce HyperCLOVA X, a family of large language models (LLMs) tailored to the Korean language and culture, along with competitive capabilities in English, math, and coding. HyperCLOVA X was trained on a balanced mix of Korean, English, and code data, followed by instruction-tuning with high-quality human-annotated datasets while abiding by strict safety guidelines reflecting our commitment to responsible AI. The model is evaluated across various benchmarks, including comprehensive reasoning, knowledge, commonsense, factuality, coding, math, chatting, instruction-following, and harmlessness, in both Korean and English. HyperCLOVA X exhibits strong reasoning capabilities in Korean backed by a deep understanding of the language and cultural nuances. Further analysis of the inherent bilingual nature and its extension to multilingualism highlights the model's cross-lingual proficiency and strong generalization ability to untargeted languages, including machine translation between several language pairs and cross-lingual inference tasks. We believe that HyperCLOVA X can provide helpful guidance for regions or countries in developing their sovereign LLMs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.01954v2-abstract-full').style.display = 'none'; document.getElementById('2404.01954v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">44 pages; updated authors list and fixed author names</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.01863">arXiv:2404.01863</a> <span> [<a href="https://arxiv.org/pdf/2404.01863">pdf</a>, <a href="https://arxiv.org/format/2404.01863">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Confidence-aware Reward Optimization for Fine-tuning Text-to-Image Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kim%2C+K">Kyuyoung Kim</a>, <a href="/search/cs?searchtype=author&query=Jeong%2C+J">Jongheon Jeong</a>, <a href="/search/cs?searchtype=author&query=An%2C+M">Minyong An</a>, <a href="/search/cs?searchtype=author&query=Ghavamzadeh%2C+M">Mohammad Ghavamzadeh</a>, <a href="/search/cs?searchtype=author&query=Dvijotham%2C+K">Krishnamurthy Dvijotham</a>, <a href="/search/cs?searchtype=author&query=Shin%2C+J">Jinwoo Shin</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+K">Kimin Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.01863v1-abstract-short" style="display: inline;"> Fine-tuning text-to-image models with reward functions trained on human feedback data has proven effective for aligning model behavior with human intent. However, excessive optimization with such reward models, which serve as mere proxy objectives, can compromise the performance of fine-tuned models, a phenomenon known as reward overoptimization. To investigate this issue in depth, we introduce th… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.01863v1-abstract-full').style.display = 'inline'; document.getElementById('2404.01863v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.01863v1-abstract-full" style="display: none;"> Fine-tuning text-to-image models with reward functions trained on human feedback data has proven effective for aligning model behavior with human intent. However, excessive optimization with such reward models, which serve as mere proxy objectives, can compromise the performance of fine-tuned models, a phenomenon known as reward overoptimization. To investigate this issue in depth, we introduce the Text-Image Alignment Assessment (TIA2) benchmark, which comprises a diverse collection of text prompts, images, and human annotations. Our evaluation of several state-of-the-art reward models on this benchmark reveals their frequent misalignment with human assessment. We empirically demonstrate that overoptimization occurs notably when a poorly aligned reward model is used as the fine-tuning objective. To address this, we propose TextNorm, a simple method that enhances alignment based on a measure of reward model confidence estimated across a set of semantically contrastive text prompts. We demonstrate that incorporating the confidence-calibrated rewards in fine-tuning effectively reduces overoptimization, resulting in twice as many wins in human evaluation for text-image alignment compared against the baseline reward models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.01863v1-abstract-full').style.display = 'none'; document.getElementById('2404.01863v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICLR 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.19904">arXiv:2403.19904</a> <span> [<a href="https://arxiv.org/pdf/2403.19904">pdf</a>, <a href="https://arxiv.org/format/2403.19904">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Fully Geometric Panoramic Localization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kim%2C+J">Junho Kim</a>, <a href="/search/cs?searchtype=author&query=Jeong%2C+J">Jiwon Jeong</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+Y+M">Young Min Kim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.19904v1-abstract-short" style="display: inline;"> We introduce a lightweight and accurate localization method that only utilizes the geometry of 2D-3D lines. Given a pre-captured 3D map, our approach localizes a panorama image, taking advantage of the holistic 360 view. The system mitigates potential privacy breaches or domain discrepancies by avoiding trained or hand-crafted visual descriptors. However, as lines alone can be ambiguous, we expres… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.19904v1-abstract-full').style.display = 'inline'; document.getElementById('2403.19904v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.19904v1-abstract-full" style="display: none;"> We introduce a lightweight and accurate localization method that only utilizes the geometry of 2D-3D lines. Given a pre-captured 3D map, our approach localizes a panorama image, taking advantage of the holistic 360 view. The system mitigates potential privacy breaches or domain discrepancies by avoiding trained or hand-crafted visual descriptors. However, as lines alone can be ambiguous, we express distinctive yet compact spatial contexts from relationships between lines, namely the dominant directions of parallel lines and the intersection between non-parallel lines. The resulting representations are efficient in processing time and memory compared to conventional visual descriptor-based methods. Given the groups of dominant line directions and their intersections, we accelerate the search process to test thousands of pose candidates in less than a millisecond without sacrificing accuracy. We empirically show that the proposed 2D-3D matching can localize panoramas for challenging scenes with similar structures, dramatic domain shifts or illumination changes. Our fully geometric approach does not involve extensive parameter tuning or neural network training, making it a practical algorithm that can be readily deployed in the real world. Project page including the code is available through this link: https://82magnolia.github.io/fgpl/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.19904v1-abstract-full').style.display = 'none'; document.getElementById('2403.19904v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to CVPR 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.18092">arXiv:2403.18092</a> <span> [<a href="https://arxiv.org/pdf/2403.18092">pdf</a>, <a href="https://arxiv.org/format/2403.18092">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> OCAI: Improving Optical Flow Estimation by Occlusion and Consistency Aware Interpolation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jeong%2C+J">Jisoo Jeong</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+H">Hong Cai</a>, <a href="/search/cs?searchtype=author&query=Garrepalli%2C+R">Risheek Garrepalli</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+J+M">Jamie Menjay Lin</a>, <a href="/search/cs?searchtype=author&query=Hayat%2C+M">Munawar Hayat</a>, <a href="/search/cs?searchtype=author&query=Porikli%2C+F">Fatih Porikli</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.18092v1-abstract-short" style="display: inline;"> The scarcity of ground-truth labels poses one major challenge in developing optical flow estimation models that are both generalizable and robust. While current methods rely on data augmentation, they have yet to fully exploit the rich information available in labeled video sequences. We propose OCAI, a method that supports robust frame interpolation by generating intermediate video frames alongsi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.18092v1-abstract-full').style.display = 'inline'; document.getElementById('2403.18092v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.18092v1-abstract-full" style="display: none;"> The scarcity of ground-truth labels poses one major challenge in developing optical flow estimation models that are both generalizable and robust. While current methods rely on data augmentation, they have yet to fully exploit the rich information available in labeled video sequences. We propose OCAI, a method that supports robust frame interpolation by generating intermediate video frames alongside optical flows in between. Utilizing a forward warping approach, OCAI employs occlusion awareness to resolve ambiguities in pixel values and fills in missing values by leveraging the forward-backward consistency of optical flows. Additionally, we introduce a teacher-student style semi-supervised learning method on top of the interpolated frames. Using a pair of unlabeled frames and the teacher model's predicted optical flow, we generate interpolated frames and flows to train a student model. The teacher's weights are maintained using Exponential Moving Averaging of the student. Our evaluations demonstrate perceptually superior interpolation quality and enhanced optical flow accuracy on established benchmarks such as Sintel and KITTI. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.18092v1-abstract-full').style.display = 'none'; document.getElementById('2403.18092v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">CVPR 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.12953">arXiv:2403.12953</a> <span> [<a href="https://arxiv.org/pdf/2403.12953">pdf</a>, <a href="https://arxiv.org/format/2403.12953">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> FutureDepth: Learning to Predict the Future Improves Video Depth Estimation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yasarla%2C+R">Rajeev Yasarla</a>, <a href="/search/cs?searchtype=author&query=Singh%2C+M+K">Manish Kumar Singh</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+H">Hong Cai</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yunxiao Shi</a>, <a href="/search/cs?searchtype=author&query=Jeong%2C+J">Jisoo Jeong</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yinhao Zhu</a>, <a href="/search/cs?searchtype=author&query=Han%2C+S">Shizhong Han</a>, <a href="/search/cs?searchtype=author&query=Garrepalli%2C+R">Risheek Garrepalli</a>, <a href="/search/cs?searchtype=author&query=Porikli%2C+F">Fatih Porikli</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.12953v2-abstract-short" style="display: inline;"> In this paper, we propose a novel video depth estimation approach, FutureDepth, which enables the model to implicitly leverage multi-frame and motion cues to improve depth estimation by making it learn to predict the future at training. More specifically, we propose a future prediction network, F-Net, which takes the features of multiple consecutive frames and is trained to predict multi-frame fea… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.12953v2-abstract-full').style.display = 'inline'; document.getElementById('2403.12953v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.12953v2-abstract-full" style="display: none;"> In this paper, we propose a novel video depth estimation approach, FutureDepth, which enables the model to implicitly leverage multi-frame and motion cues to improve depth estimation by making it learn to predict the future at training. More specifically, we propose a future prediction network, F-Net, which takes the features of multiple consecutive frames and is trained to predict multi-frame features one time step ahead iteratively. In this way, F-Net learns the underlying motion and correspondence information, and we incorporate its features into the depth decoding process. Additionally, to enrich the learning of multiframe correspondence cues, we further leverage a reconstruction network, R-Net, which is trained via adaptively masked auto-encoding of multiframe feature volumes. At inference time, both F-Net and R-Net are used to produce queries to work with the depth decoder, as well as a final refinement network. Through extensive experiments on several benchmarks, i.e., NYUDv2, KITTI, DDAD, and Sintel, which cover indoor, driving, and open-domain scenarios, we show that FutureDepth significantly improves upon baseline models, outperforms existing video depth estimation methods, and sets new state-of-the-art (SOTA) accuracy. Furthermore, FutureDepth is more efficient than existing SOTA video depth estimation models and has similar latencies when comparing to monocular models <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.12953v2-abstract-full').style.display = 'none'; document.getElementById('2403.12953v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ECCV 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.10052">arXiv:2403.10052</a> <span> [<a href="https://arxiv.org/pdf/2403.10052">pdf</a>, <a href="https://arxiv.org/format/2403.10052">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> T4P: Test-Time Training of Trajectory Prediction via Masked Autoencoder and Actor-specific Token Memory </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Park%2C+D">Daehee Park</a>, <a href="/search/cs?searchtype=author&query=Jeong%2C+J">Jaeseok Jeong</a>, <a href="/search/cs?searchtype=author&query=Yoon%2C+S">Sung-Hoon Yoon</a>, <a href="/search/cs?searchtype=author&query=Jeong%2C+J">Jaewoo Jeong</a>, <a href="/search/cs?searchtype=author&query=Yoon%2C+K">Kuk-Jin Yoon</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.10052v1-abstract-short" style="display: inline;"> Trajectory prediction is a challenging problem that requires considering interactions among multiple actors and the surrounding environment. While data-driven approaches have been used to address this complex problem, they suffer from unreliable predictions under distribution shifts during test time. Accordingly, several online learning methods have been proposed using regression loss from the gro… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.10052v1-abstract-full').style.display = 'inline'; document.getElementById('2403.10052v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.10052v1-abstract-full" style="display: none;"> Trajectory prediction is a challenging problem that requires considering interactions among multiple actors and the surrounding environment. While data-driven approaches have been used to address this complex problem, they suffer from unreliable predictions under distribution shifts during test time. Accordingly, several online learning methods have been proposed using regression loss from the ground truth of observed data leveraging the auto-labeling nature of trajectory prediction task. We mainly tackle the following two issues. First, previous works underfit and overfit as they only optimize the last layer of the motion decoder. To this end, we employ the masked autoencoder (MAE) for representation learning to encourage complex interaction modeling in shifted test distribution for updating deeper layers. Second, utilizing the sequential nature of driving data, we propose an actor-specific token memory that enables the test-time learning of actor-wise motion characteristics. Our proposed method has been validated across various challenging cross-dataset distribution shift scenarios including nuScenes, Lyft, Waymo, and Interaction. Our method surpasses the performance of existing state-of-the-art online learning methods in terms of both prediction accuracy and computational efficiency. The code is available at https://github.com/daeheepark/T4P. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.10052v1-abstract-full').style.display = 'none'; document.getElementById('2403.10052v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">CVPR 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.08187">arXiv:2403.08187</a> <span> [<a href="https://arxiv.org/pdf/2403.08187">pdf</a>, <a href="https://arxiv.org/format/2403.08187">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Automatic Speech Recognition (ASR) for the Diagnosis of pronunciation of Speech Sound Disorders in Korean children </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ahn%2C+T">Taekyung Ahn</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+Y">Yeonjung Hong</a>, <a href="/search/cs?searchtype=author&query=Im%2C+Y">Younggon Im</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+D+H">Do Hyung Kim</a>, <a href="/search/cs?searchtype=author&query=Kang%2C+D">Dayoung Kang</a>, <a href="/search/cs?searchtype=author&query=Jeong%2C+J+W">Joo Won Jeong</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+J+W">Jae Won Kim</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+M+J">Min Jung Kim</a>, <a href="/search/cs?searchtype=author&query=Cho%2C+A">Ah-ra Cho</a>, <a href="/search/cs?searchtype=author&query=Jang%2C+D">Dae-Hyun Jang</a>, <a href="/search/cs?searchtype=author&query=Nam%2C+H">Hosung Nam</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.08187v1-abstract-short" style="display: inline;"> This study presents a model of automatic speech recognition (ASR) designed to diagnose pronunciation issues in children with speech sound disorders (SSDs) to replace manual transcriptions in clinical procedures. Since ASR models trained for general purposes primarily predict input speech into real words, employing a well-known high-performance ASR model for evaluating pronunciation in children wit… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.08187v1-abstract-full').style.display = 'inline'; document.getElementById('2403.08187v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.08187v1-abstract-full" style="display: none;"> This study presents a model of automatic speech recognition (ASR) designed to diagnose pronunciation issues in children with speech sound disorders (SSDs) to replace manual transcriptions in clinical procedures. Since ASR models trained for general purposes primarily predict input speech into real words, employing a well-known high-performance ASR model for evaluating pronunciation in children with SSDs is impractical. We fine-tuned the wav2vec 2.0 XLS-R model to recognize speech as pronounced rather than as existing words. The model was fine-tuned with a speech dataset from 137 children with inadequate speech production pronouncing 73 Korean words selected for actual clinical diagnosis. The model's predictions of the pronunciations of the words matched the human annotations with about 90% accuracy. While the model still requires improvement in recognizing unclear pronunciation, this study demonstrates that ASR models can streamline complex pronunciation error diagnostic procedures in clinical fields. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.08187v1-abstract-full').style.display = 'none'; document.getElementById('2403.08187v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 2 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.7 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.07371">arXiv:2403.07371</a> <span> [<a href="https://arxiv.org/pdf/2403.07371">pdf</a>, <a href="https://arxiv.org/format/2403.07371">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Time-Efficient and Identity-Consistent Virtual Try-On Using A Variant of Altered Diffusion Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Dam%2C+P">Phuong Dam</a>, <a href="/search/cs?searchtype=author&query=Jeong%2C+J">Jihoon Jeong</a>, <a href="/search/cs?searchtype=author&query=Tran%2C+A">Anh Tran</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+D">Daeyoung Kim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.07371v3-abstract-short" style="display: inline;"> This study discusses the critical issues of Virtual Try-On in contemporary e-commerce and the prospective metaverse, emphasizing the challenges of preserving intricate texture details and distinctive features of the target person and the clothes in various scenarios, such as clothing texture and identity characteristics like tattoos or accessories. In addition to the fidelity of the synthesized im… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.07371v3-abstract-full').style.display = 'inline'; document.getElementById('2403.07371v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.07371v3-abstract-full" style="display: none;"> This study discusses the critical issues of Virtual Try-On in contemporary e-commerce and the prospective metaverse, emphasizing the challenges of preserving intricate texture details and distinctive features of the target person and the clothes in various scenarios, such as clothing texture and identity characteristics like tattoos or accessories. In addition to the fidelity of the synthesized images, the efficiency of the synthesis process presents a significant hurdle. Various existing approaches are explored, highlighting the limitations and unresolved aspects, e.g., identity information omission, uncontrollable artifacts, and low synthesis speed. It then proposes a novel diffusion-based solution that addresses garment texture preservation and user identity retention during virtual try-on. The proposed network comprises two primary modules - a warping module aligning clothing with individual features and a try-on module refining the attire and generating missing parts integrated with a mask-aware post-processing technique ensuring the integrity of the individual's identity. It demonstrates impressive results, surpassing the state-of-the-art in speed by nearly 20 times during inference, with superior fidelity in qualitative assessments. Quantitative evaluations confirm comparable performance with the recent SOTA method on the VITON-HD and Dresscode datasets. We named our model Fast and Identity Preservation Virtual TryON (FIP-VITON). <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.07371v3-abstract-full').style.display = 'none'; document.getElementById('2403.07371v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.06471">arXiv:2403.06471</a> <span> [<a href="https://arxiv.org/pdf/2403.06471">pdf</a>, <a href="https://arxiv.org/format/2403.06471">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Toward Robust Canine Cardiac Diagnosis: Deep Prototype Alignment Network-Based Few-Shot Segmentation in Veterinary Medicine </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Oh%2C+J">Jun-Young Oh</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+I">In-Gyu Lee</a>, <a href="/search/cs?searchtype=author&query=Kam%2C+T">Tae-Eui Kam</a>, <a href="/search/cs?searchtype=author&query=Jeong%2C+J">Ji-Hoon Jeong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.06471v1-abstract-short" style="display: inline;"> In the cutting-edge domain of medical artificial intelligence (AI), remarkable advances have been achieved in areas such as diagnosis, prediction, and therapeutic interventions. Despite these advances, the technology for image segmentation faces the significant barrier of having to produce extensively annotated datasets. To address this challenge, few-shot segmentation (FSS) has been recognized as… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.06471v1-abstract-full').style.display = 'inline'; document.getElementById('2403.06471v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.06471v1-abstract-full" style="display: none;"> In the cutting-edge domain of medical artificial intelligence (AI), remarkable advances have been achieved in areas such as diagnosis, prediction, and therapeutic interventions. Despite these advances, the technology for image segmentation faces the significant barrier of having to produce extensively annotated datasets. To address this challenge, few-shot segmentation (FSS) has been recognized as one of the innovative solutions. Although most of the FSS research has focused on human health care, its application in veterinary medicine, particularly for pet care, remains largely limited. This study has focused on accurate segmentation of the heart and left atrial enlargement on canine chest radiographs using the proposed deep prototype alignment network (DPANet). The PANet architecture is adopted as the backbone model, and experiments are conducted using various encoders based on VGG-19, ResNet-18, and ResNet-50 to extract features. Experimental results demonstrate that the proposed DPANet achieves the highest performance. In the 2way-1shot scenario, it achieves the highest intersection over union (IoU) value of 0.6966, and in the 2way-5shot scenario, it achieves the highest IoU value of 0.797. The DPANet not only signifies a performance improvement, but also shows an improved training speed in the 2way-5shot scenario. These results highlight our model's exceptional capability as a trailblazing solution for segmenting the heart and left atrial enlargement in veterinary applications through FSS, setting a new benchmark in veterinary AI research, and demonstrating its superior potential to veterinary medicine advances. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.06471v1-abstract-full').style.display = 'none'; document.getElementById('2403.06471v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.03642">arXiv:2403.03642</a> <span> [<a href="https://arxiv.org/pdf/2403.03642">pdf</a>, <a href="https://arxiv.org/format/2403.03642">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Generative Active Learning with Variational Autoencoder for Radiology Data Generation in Veterinary Medicine </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lee%2C+I">In-Gyu Lee</a>, <a href="/search/cs?searchtype=author&query=Oh%2C+J">Jun-Young Oh</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+H">Hee-Jung Yu</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+J">Jae-Hwan Kim</a>, <a href="/search/cs?searchtype=author&query=Eom%2C+K">Ki-Dong Eom</a>, <a href="/search/cs?searchtype=author&query=Jeong%2C+J">Ji-Hoon Jeong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.03642v1-abstract-short" style="display: inline;"> Recently, with increasing interest in pet healthcare, the demand for computer-aided diagnosis (CAD) systems in veterinary medicine has increased. The development of veterinary CAD has stagnated due to a lack of sufficient radiology data. To overcome the challenge, we propose a generative active learning framework based on a variational autoencoder. This approach aims to alleviate the scarcity of r… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.03642v1-abstract-full').style.display = 'inline'; document.getElementById('2403.03642v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.03642v1-abstract-full" style="display: none;"> Recently, with increasing interest in pet healthcare, the demand for computer-aided diagnosis (CAD) systems in veterinary medicine has increased. The development of veterinary CAD has stagnated due to a lack of sufficient radiology data. To overcome the challenge, we propose a generative active learning framework based on a variational autoencoder. This approach aims to alleviate the scarcity of reliable data for CAD systems in veterinary medicine. This study utilizes datasets comprising cardiomegaly radiograph data. After removing annotations and standardizing images, we employed a framework for data augmentation, which consists of a data generation phase and a query phase for filtering the generated data. The experimental results revealed that as the data generated through this framework was added to the training data of the generative model, the frechet inception distance consistently decreased from 84.14 to 50.75 on the radiograph. Subsequently, when the generated data were incorporated into the training of the classification model, the false positive of the confusion matrix also improved from 0.16 to 0.66 on the radiograph. The proposed framework has the potential to address the challenges of data scarcity in medical CAD, contributing to its advancement. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.03642v1-abstract-full').style.display = 'none'; document.getElementById('2403.03642v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.03526">arXiv:2403.03526</a> <span> [<a href="https://arxiv.org/pdf/2403.03526">pdf</a>, <a href="https://arxiv.org/format/2403.03526">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neurons and Cognition">q-bio.NC</span> </div> </div> <p class="title is-5 mathjax"> FingerNet: EEG Decoding of A Fine Motor Imagery with Finger-tapping Task Based on A Deep Neural Network </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Go%2C+Y">Young-Min Go</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+S">Seong-Hyun Yu</a>, <a href="/search/cs?searchtype=author&query=Park%2C+H">Hyeong-Yeong Park</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+M">Minji Lee</a>, <a href="/search/cs?searchtype=author&query=Jeong%2C+J">Ji-Hoon Jeong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.03526v1-abstract-short" style="display: inline;"> Brain-computer interface (BCI) technology facilitates communication between the human brain and computers, primarily utilizing electroencephalography (EEG) signals to discern human intentions. Although EEG-based BCI systems have been developed for paralysis individuals, ongoing studies explore systems for speech imagery and motor imagery (MI). This study introduces FingerNet, a specialized network… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.03526v1-abstract-full').style.display = 'inline'; document.getElementById('2403.03526v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.03526v1-abstract-full" style="display: none;"> Brain-computer interface (BCI) technology facilitates communication between the human brain and computers, primarily utilizing electroencephalography (EEG) signals to discern human intentions. Although EEG-based BCI systems have been developed for paralysis individuals, ongoing studies explore systems for speech imagery and motor imagery (MI). This study introduces FingerNet, a specialized network for fine MI classification, departing from conventional gross MI studies. The proposed FingerNet could extract spatial and temporal features from EEG signals, improving classification accuracy within the same hand. The experimental results demonstrated that performance showed significantly higher accuracy in classifying five finger-tapping tasks, encompassing thumb, index, middle, ring, and little finger movements. FingerNet demonstrated dominant performance compared to the conventional baseline models, EEGNet and DeepConvNet. The average accuracy for FingerNet was 0.3049, whereas EEGNet and DeepConvNet exhibited lower accuracies of 0.2196 and 0.2533, respectively. Statistical validation also demonstrates the predominance of FingerNet over baseline networks. For biased predictions, particularly for thumb and index classes, we led to the implementation of weighted cross-entropy and also adapted the weighted cross-entropy, a method conventionally employed to mitigate class imbalance. The proposed FingerNet involves optimizing network structure, improving performance, and exploring applications beyond fine MI. Moreover, the weighted Cross Entropy approach employed to address such biased predictions appears to have broader applicability and relevance across various domains involving multi-class classification tasks. We believe that effective execution of motor imagery can be achieved not only for fine MI, but also for local muscle MI <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.03526v1-abstract-full').style.display = 'none'; document.getElementById('2403.03526v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages,5 figures, and 2 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.12974">arXiv:2402.12974</a> <span> [<a href="https://arxiv.org/pdf/2402.12974">pdf</a>, <a href="https://arxiv.org/format/2402.12974">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Visual Style Prompting with Swapping Self-Attention </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jeong%2C+J">Jaeseok Jeong</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+J">Junho Kim</a>, <a href="/search/cs?searchtype=author&query=Choi%2C+Y">Yunjey Choi</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+G">Gayoung Lee</a>, <a href="/search/cs?searchtype=author&query=Uh%2C+Y">Youngjung Uh</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.12974v2-abstract-short" style="display: inline;"> In the evolving domain of text-to-image generation, diffusion models have emerged as powerful tools in content creation. Despite their remarkable capability, existing models still face challenges in achieving controlled generation with a consistent style, requiring costly fine-tuning or often inadequately transferring the visual elements due to content leakage. To address these challenges, we prop… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.12974v2-abstract-full').style.display = 'inline'; document.getElementById('2402.12974v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.12974v2-abstract-full" style="display: none;"> In the evolving domain of text-to-image generation, diffusion models have emerged as powerful tools in content creation. Despite their remarkable capability, existing models still face challenges in achieving controlled generation with a consistent style, requiring costly fine-tuning or often inadequately transferring the visual elements due to content leakage. To address these challenges, we propose a novel approach, \ours, to produce a diverse range of images while maintaining specific style elements and nuances. During the denoising process, we keep the query from original features while swapping the key and value with those from reference features in the late self-attention layers. This approach allows for the visual style prompting without any fine-tuning, ensuring that generated images maintain a faithful style. Through extensive evaluation across various styles and text prompts, our method demonstrates superiority over existing approaches, best reflecting the style of the references and ensuring that resulting images match the text prompts most accurately. Our project page is available https://curryjung.github.io/VisualStylePrompt/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.12974v2-abstract-full').style.display = 'none'; document.getElementById('2402.12974v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.08897">arXiv:2401.08897</a> <span> [<a href="https://arxiv.org/pdf/2401.08897">pdf</a>, <a href="https://arxiv.org/format/2401.08897">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> CFASL: Composite Factor-Aligned Symmetry Learning for Disentanglement in Variational AutoEncoder </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jung%2C+H">Hee-Jun Jung</a>, <a href="/search/cs?searchtype=author&query=Jeong%2C+J">Jaehyoung Jeong</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+K">Kangil Kim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.08897v3-abstract-short" style="display: inline;"> Symmetries of input and latent vectors have provided valuable insights for disentanglement learning in VAEs. However, only a few works were proposed as an unsupervised method, and even these works require known factor information in the training data. We propose a novel method, Composite Factor-Aligned Symmetry Learning (CFASL), which is integrated into VAEs for learning symmetry-based disentangle… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.08897v3-abstract-full').style.display = 'inline'; document.getElementById('2401.08897v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.08897v3-abstract-full" style="display: none;"> Symmetries of input and latent vectors have provided valuable insights for disentanglement learning in VAEs. However, only a few works were proposed as an unsupervised method, and even these works require known factor information in the training data. We propose a novel method, Composite Factor-Aligned Symmetry Learning (CFASL), which is integrated into VAEs for learning symmetry-based disentanglement in unsupervised learning without any knowledge of the dataset factor information. CFASL incorporates three novel features for learning symmetry-based disentanglement: 1) Injecting inductive bias to align latent vector dimensions to factor-aligned symmetries within an explicit learnable symmetry code-book 2) Learning a composite symmetry to express unknown factors change between two random samples by learning factor-aligned symmetries within the codebook 3) Inducing a group equivariant encoder and decoder in training VAEs with the two conditions. In addition, we propose an extended evaluation metric for multi-factor changes in comparison to disentanglement evaluation in VAEs. In quantitative and in-depth qualitative analysis, CFASL demonstrates a significant improvement of disentanglement in single-factor change, and multi-factor change conditions compared to state-of-the-art methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.08897v3-abstract-full').style.display = 'none'; document.getElementById('2401.08897v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted in TMLR 25 pages, 14 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.15906">arXiv:2312.15906</a> <span> [<a href="https://arxiv.org/pdf/2312.15906">pdf</a>, <a href="https://arxiv.org/format/2312.15906">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Improving Transferability for Cross-domain Trajectory Prediction via Neural Stochastic Differential Equation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Park%2C+D">Daehee Park</a>, <a href="/search/cs?searchtype=author&query=Jeong%2C+J">Jaewoo Jeong</a>, <a href="/search/cs?searchtype=author&query=Yoon%2C+K">Kuk-Jin Yoon</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.15906v1-abstract-short" style="display: inline;"> Multi-agent trajectory prediction is crucial for various practical applications, spurring the construction of many large-scale trajectory datasets, including vehicles and pedestrians. However, discrepancies exist among datasets due to external factors and data acquisition strategies. External factors include geographical differences and driving styles, while data acquisition strategies include dat… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.15906v1-abstract-full').style.display = 'inline'; document.getElementById('2312.15906v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.15906v1-abstract-full" style="display: none;"> Multi-agent trajectory prediction is crucial for various practical applications, spurring the construction of many large-scale trajectory datasets, including vehicles and pedestrians. However, discrepancies exist among datasets due to external factors and data acquisition strategies. External factors include geographical differences and driving styles, while data acquisition strategies include data acquisition rate, history/prediction length, and detector/tracker error. Consequently, the proficient performance of models trained on large-scale datasets has limited transferability on other small-size datasets, bounding the utilization of existing large-scale datasets. To address this limitation, we propose a method based on continuous and stochastic representations of Neural Stochastic Differential Equations (NSDE) for alleviating discrepancies due to data acquisition strategy. We utilize the benefits of continuous representation for handling arbitrary time steps and the use of stochastic representation for handling detector/tracker errors. Additionally, we propose a dataset-specific diffusion network and its training framework to handle dataset-specific detection/tracking errors. The effectiveness of our method is validated against state-of-the-art trajectory prediction models on the popular benchmark datasets: nuScenes, Argoverse, Lyft, INTERACTION, and Waymo Open Motion Dataset (WOMD). Improvement in performance gain on various source and target dataset configurations shows the generalized competence of our approach in addressing cross-dataset discrepancies. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.15906v1-abstract-full').style.display = 'none'; document.getElementById('2312.15906v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">AAAI24</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.09446">arXiv:2312.09446</a> <span> [<a href="https://arxiv.org/pdf/2312.09446">pdf</a>, <a href="https://arxiv.org/format/2312.09446">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> A Distributed Inference System for Detecting Task-wise Single Trial Event-Related Potential in Stream of Satellite Images </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kim%2C+S">Sung-Jin Kim</a>, <a href="/search/cs?searchtype=author&query=Kwak%2C+H">Heon-Gyu Kwak</a>, <a href="/search/cs?searchtype=author&query=Han%2C+H">Hyeon-Taek Han</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+D">Dae-Hyeok Lee</a>, <a href="/search/cs?searchtype=author&query=Jeong%2C+J">Ji-Hoon Jeong</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+S">Seong-Whan Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.09446v1-abstract-short" style="display: inline;"> Brain-computer interface (BCI) has garnered the significant attention for their potential in various applications, with event-related potential (ERP) performing a considerable role in BCI systems. This paper introduces a novel Distributed Inference System tailored for detecting task-wise single-trial ERPs in a stream of satellite images. Unlike traditional methodologies that employ a single model… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.09446v1-abstract-full').style.display = 'inline'; document.getElementById('2312.09446v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.09446v1-abstract-full" style="display: none;"> Brain-computer interface (BCI) has garnered the significant attention for their potential in various applications, with event-related potential (ERP) performing a considerable role in BCI systems. This paper introduces a novel Distributed Inference System tailored for detecting task-wise single-trial ERPs in a stream of satellite images. Unlike traditional methodologies that employ a single model for target detection, our system utilizes multiple models, each optimized for specific tasks, ensuring enhanced performance across varying image transition times and target onset times. Our experiments, conducted on four participants, employed two paradigms: the Normal paradigm and an AI paradigm with bounding boxes. Results indicate that our proposed system outperforms the conventional methods in both paradigms, achieving the highest $F_尾$ scores. Furthermore, including bounding boxes in the AI paradigm significantly improved target recognition. This study underscores the potential of our Distributed Inference System in advancing the field of ERP detection in satellite image streams. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.09446v1-abstract-full').style.display = 'none'; document.getElementById('2312.09446v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Jeong%2C+J&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Jeong%2C+J&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Jeong%2C+J&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Jeong%2C+J&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Jeong%2C+J&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Jeong%2C+J&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> </ul> </nav> <div class="is-hidden-tablet">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary">  <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div>   <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div>  </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

CINXE.COM

Search | arXiv e-print repository