Search | arXiv e-print repository
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="utf-8"/>
  <meta name="viewport" content="width=device-width, initial-scale=1"/>
  <!-- new favicon config and versions by -->
  <link rel="apple-touch-icon" sizes="180x180" href="">
  <link rel="icon" type="image/png" sizes="32x32" href="">
  <link rel="icon" type="image/png" sizes="16x16" href="">
  <link rel="manifest" href="">
  <link rel="mask-icon" href="" color="#b31b1b">
  <link rel="shortcut icon" href="">
  <meta name="msapplication-TileColor" content="#b31b1b">
  <meta name="msapplication-config" content="images/icons/browserconfig.xml">
  <meta name="theme-color" content="#b31b1b">
  <!-- end favicon config -->
  <title>Search | arXiv e-print repository</title>
  <script defer src=""></script>
  <link rel="stylesheet" href="" />
  <!-- MathJax configuration: ignoreClass '.*' combined with processClass 'mathjax.*'
       restricts typesetting to elements carrying a class that starts with "mathjax". -->
  <script type="text/x-mathjax-config">
    MathJax.Hub.Config({
      messageStyle: "none",
      extensions: ["tex2jax.js"],
      jax: ["input/TeX", "output/HTML-CSS"],
      tex2jax: {
        inlineMath: [ ['$','$'], ["\\(","\\)"] ],
        displayMath: [ ['$$','$$'], ["\\[","\\]"] ],
        processEscapes: true,
        ignoreClass: '.*',
        processClass: 'mathjax.*'
      },
      TeX: {
        extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"],
        noErrors: {
          inlineDelimiters: ["$","$"],
          multiLine: false,
          style: { "font-size": "normal", "border": "" }
        }
      },
      "HTML-CSS": { availableFonts: ["TeX"] }
    });
  </script>
  <script src='//'></script>
  <script src=""></script>
  <link rel="stylesheet" href="" />
  <link rel="stylesheet" href="" />
  <script src="" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script>
  <script src=""></script>
  <style>
    /* NOTE(review): "radio" is not an HTML element name, so this type selector matches
       nothing in this document as written - confirm what it was meant to target. */
    radio#cf-customfield_11400 { display: none; }
  </style>
</head>
<body>
<header><a href="#main-container" class="is-sr-only">Skip to main content</a>
  <!-- contains Cornell logo and sponsor statement -->
  <!-- No role="banner" here: the enclosing body-level <header> already is the banner landmark. -->
  <div class="attribution level is-marginless">
    <div class="level-left">
      <!-- The image's alt text names this link; an aria-label on the image would override it. -->
      <a class="level-item" href=""><img src="" alt="Cornell University" width="200" /></a>
    </div>
    <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="">member institutions</a>, and all contributors. <a href="">Donate</a></span></p></div>
  </div>
  <!-- contains arXiv identity and search bar -->
  <div class="identity level is-marginless">
    <div class="level-left">
      <div class="level-item">
        <a class="arxiv" href="" aria-label="arxiv-logo">
          <img src="" alt="arxiv logo" width="85" style="width:85px;"/>
        </a>
      </div>
    </div>
    <div class="search-block level-right">
      <!-- Compact header search; submits query, searchtype and source=header via GET. -->
      <form class="level-item mini-search" method="GET" action="">
        <div class="field has-addons">
          <div class="control">
            <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" />
            <p class="help"><a href="">Help</a> | <a href="">Advanced Search</a></p>
          </div>
          <div class="control">
            <div class="select is-small">
              <select name="searchtype" aria-label="Field to search">
                <option value="all" selected="selected">All fields</option>
                <option value="title">Title</option>
                <option value="author">Author</option>
                <option value="abstract">Abstract</option>
                <option value="comments">Comments</option>
                <option value="journal_ref">Journal reference</option>
                <option value="acm_class">ACM classification</option>
                <option value="msc_class">MSC classification</option>
                <option value="report_num">Report number</option>
                <option value="paper_id">arXiv identifier</option>
                <option value="doi">DOI</option>
                <option value="orcid">ORCID</option>
                <option value="author_id">arXiv author ID</option>
                <option value="help">Help pages</option>
                <option value="full_text">Full text</option>
              </select>
            </div>
          </div>
          <input type="hidden" name="source" value="header">
          <button class="button is-small is-cul-darker">Search</button>
        </div>
      </form>
    </div>
  </div> <!-- closes identity -->
  <div class="container">
    <div class="user-tools is-size-7 has-text-right
has-text-weight-bold" role="navigation" aria-label="User menu">
      <a href="">Login</a>
    </div>
  </div>
</header>
<main class="container" id="main-container">
  <div class="level is-marginless">
    <div class="level-left">
      <h1 class="title is-clearfix"> Showing 1–50 of 163 results for author: <span class="mathjax">Kweon, S</span> </h1>
    </div>
    <div class="level-right is-hidden-mobile">
      <!-- feedback for mobile is moved to footer -->
      <span class="help" style="display: inline-block;"><a href="">Search v0.5.6 released 2020-02-24</a> </span>
    </div>
  </div>
  <div class="content">
    <!-- Archive-scoped search form. role="search" (not the non-existent "aria-role")
         exposes it as a search landmark to assistive technology. -->
    <form method="GET" action="/search/cs" role="search">
      Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Kweon%2C+S">Search in all archives.</a>
      <div class="field has-addons-tablet">
        <div class="control is-expanded">
          <label for="query" class="hidden-label">Search term or terms</label>
          <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Kweon, S">
        </div>
        <div class="select control is-medium">
          <label class="is-hidden" for="searchtype">Field</label>
          <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select>
        </div>
        <div class="control">
          <button class="button is-link is-medium">Search</button>
        </div>
      </div>
      <div
class="field">
        <div class="control is-size-7">
          <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label>
          <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label>
        </div>
      </div>
      <div class="is-clearfix" style="height: 2.5em">
        <div class="is-pulled-right">
          <a href="/search/advanced?terms-0-term=Kweon%2C+S&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a>
        </div>
      </div>
      <!-- Current ordering and page size, resubmitted with the query. -->
      <input type="hidden" name="order" value="-announced_date_first">
      <input type="hidden" name="size" value="50">
    </form>
    <div class="level breathe-horizontal">
      <div class="level-left">
        <!-- Page-size / sort-order form. The display:none block below carries the current
             searchtype, query and abstracts values so they are resubmitted along with
             size and order. Its ids are prefixed with "sort-" so they do not collide with
             the ids of the visible search form above; the name attributes (which determine
             what is submitted) are unchanged.
             NOTE(review): this form posts to /search/ while the form above posts to
             /search/cs - confirm whether dropping the archive scope here is intended. -->
        <form method="GET" action="/search/">
          <div style="display: none;">
            <select id="sort-searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select>
            <input id="sort-query" name="query" type="text" value="Kweon, S">
            <ul id="abstracts"><li><input checked id="sort-abstracts-0" name="abstracts" type="radio" value="show"> <label for="sort-abstracts-0">Show abstracts</label></li><li><input id="sort-abstracts-1" name="abstracts" type="radio" value="hide"> <label for="sort-abstracts-1">Hide abstracts</label></li></ul>
          </div>
          <div class="box field is-grouped is-grouped-multiline level-item">
            <div class="control">
              <span class="select
is-small">
                <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select>
              </span>
              <label for="size">results per page</label>.
            </div>
            <div class="control">
              <label for="order">Sort results by</label>
              <span class="select is-small">
                <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select>
              </span>
            </div>
            <div class="control">
              <button class="button is-small is-link">Go</button>
            </div>
          </div>
        </form>
      </div>
    </div>
    <!-- Pagination. Only the link carrying the is-current class (page 1 here) is marked
         aria-current="page"; the remaining links are labelled as navigation targets. -->
    <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination">
      <a href="" class="pagination-previous is-invisible">Previous </a>
      <a href="/search/?searchtype=author&amp;query=Kweon%2C+S&amp;start=50" class="pagination-next" >Next </a>
      <ul class="pagination-list">
        <li> <a href="/search/?searchtype=author&amp;query=Kweon%2C+S&amp;start=0" class="pagination-link is-current" aria-label="Page 1" aria-current="page">1 </a> </li>
        <li> <a href="/search/?searchtype=author&amp;query=Kweon%2C+S&amp;start=50" class="pagination-link " aria-label="Goto page 2">2 </a> </li>
        <li> <a href="/search/?searchtype=author&amp;query=Kweon%2C+S&amp;start=100" class="pagination-link " aria-label="Goto page 3">3 </a> </li>
        <li> <a href="/search/?searchtype=author&amp;query=Kweon%2C+S&amp;start=150" class="pagination-link " aria-label="Goto page 4">4 </a> </li>
      </ul>
    </nav>
    <!-- Result list; each <li class="arxiv-result"> is one paper. -->
    <ol class="breathe-horizontal" start="1">
      <li class="arxiv-result">
        <div class="is-marginless">
          <p class="list-title is-inline-block"><a href="">arXiv:2410.05210</a> <span> [<a href="">pdf</a>, <a href="">other</a>] </span> </p>
          <div class="tags
is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Preserving Multi-Modal Capabilities of Pre-trained VLMs for Improving Vision-Linguistic Compositionality </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Oh%2C+Y">Youngtaek Oh</a>, <a href="/search/cs?searchtype=author&query=Cho%2C+J+W">Jae Won Cho</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+D">Dong-Jin Kim</a>, <a href="/search/cs?searchtype=author&query=Kweon%2C+I+S">In So Kweon</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+J">Junmo Kim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.05210v1-abstract-short" style="display: inline;"> In this paper, we propose a new method to enhance compositional understanding in pre-trained vision and language models (VLMs) without sacrificing performance in zero-shot multi-modal tasks. 
Traditional fine-tuning approaches often improve compositional reasoning at the cost of degrading multi-modal capabilities, primarily due to the use of global hard negative (HN) loss, which contrasts global re… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05210v1-abstract-full').style.display = 'inline'; document.getElementById('2410.05210v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.05210v1-abstract-full" style="display: none;"> In this paper, we propose a new method to enhance compositional understanding in pre-trained vision and language models (VLMs) without sacrificing performance in zero-shot multi-modal tasks. Traditional fine-tuning approaches often improve compositional reasoning at the cost of degrading multi-modal capabilities, primarily due to the use of global hard negative (HN) loss, which contrasts global representations of images and texts. This global HN loss pushes HN texts that are highly similar to the original ones, damaging the model's multi-modal representations. To overcome this limitation, we propose Fine-grained Selective Calibrated CLIP (FSC-CLIP), which integrates local hard negative loss and selective calibrated regularization. These innovations provide fine-grained negative supervision while preserving the model's representational integrity. Our extensive evaluations across diverse benchmarks for both compositionality and multi-modal tasks show that FSC-CLIP not only achieves compositionality on par with state-of-the-art models but also retains strong multi-modal capabilities. 
Code is available at: <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05210v1-abstract-full').style.display = 'none'; document.getElementById('2410.05210v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">EMNLP 2024 (Long, Main). Project page:</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="">arXiv:2406.18898</a> <span> [<a href="">pdf</a>, <a href="">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> 360 in the Wild: Dataset for Depth Prediction and View Synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Park%2C+K">Kibaek Park</a>, <a href="/search/cs?searchtype=author&query=Rameau%2C+F">Francois Rameau</a>, <a href="/search/cs?searchtype=author&query=Park%2C+J">Jaesik Park</a>, <a href="/search/cs?searchtype=author&query=Kweon%2C+I+S">In So Kweon</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.18898v2-abstract-short" style="display: inline;"> The large abundance of perspective camera datasets facilitated the emergence of novel learning-based strategies for various tasks, such as camera 
localization, single image depth estimation, or view synthesis. However, panoramic or omnidirectional image datasets, including essential information, such as pose and depth, are mostly made with synthetic scenes. In this work, we introduce a large scale… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.18898v2-abstract-full').style.display = 'inline'; document.getElementById('2406.18898v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.18898v2-abstract-full" style="display: none;"> The large abundance of perspective camera datasets facilitated the emergence of novel learning-based strategies for various tasks, such as camera localization, single image depth estimation, or view synthesis. However, panoramic or omnidirectional image datasets, including essential information, such as pose and depth, are mostly made with synthetic scenes. In this work, we introduce a large scale 360$^{\circ}$ videos dataset in the wild. This dataset has been carefully scraped from the Internet and has been captured from various locations worldwide. Hence, this dataset exhibits very diversified environments (e.g., indoor and outdoor) and contexts (e.g., with and without moving objects). Each of the 25K images constituting our dataset is provided with its respective camera's pose and depth map. We illustrate the relevance of our dataset for two main tasks, namely, single image depth estimation and view synthesis. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.18898v2-abstract-full').style.display = 'none'; document.getElementById('2406.18898v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="">arXiv:2406.09388</a> <span> [<a href="">pdf</a>, <a href="">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Exploring the Spectrum of Visio-Linguistic Compositionality and Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Oh%2C+Y">Youngtaek Oh</a>, <a href="/search/cs?searchtype=author&query=Ahn%2C+P">Pyunghwan Ahn</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+J">Jinhyung Kim</a>, <a href="/search/cs?searchtype=author&query=Song%2C+G">Gwangmo Song</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+S">Soonyoung Lee</a>, <a href="/search/cs?searchtype=author&query=Kweon%2C+I+S">In So Kweon</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+J">Junmo Kim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" 
id="2406.09388v1-abstract-short" style="display: inline;"> Vision and language models (VLMs) such as CLIP have showcased remarkable zero-shot recognition abilities yet face challenges in visio-linguistic compositionality, particularly in linguistic comprehension and fine-grained image-text alignment. This paper explores the intricate relationship between compositionality and recognition -- two pivotal aspects of VLM capability. We conduct a comprehensive… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.09388v1-abstract-full').style.display = 'inline'; document.getElementById('2406.09388v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.09388v1-abstract-full" style="display: none;"> Vision and language models (VLMs) such as CLIP have showcased remarkable zero-shot recognition abilities yet face challenges in visio-linguistic compositionality, particularly in linguistic comprehension and fine-grained image-text alignment. This paper explores the intricate relationship between compositionality and recognition -- two pivotal aspects of VLM capability. We conduct a comprehensive evaluation of existing VLMs, covering both pre-training approaches aimed at recognition and the fine-tuning methods designed to improve compositionality. Our evaluation employs 12 benchmarks for compositionality, along with 21 zero-shot classification and two retrieval benchmarks for recognition. In our analysis from 274 CLIP model checkpoints, we reveal patterns and trade-offs that emerge between compositional understanding and recognition accuracy. Ultimately, this necessitates strategic efforts towards developing models that improve both capabilities, as well as the meticulous formulation of benchmarks for compositionality. 
We open our evaluation framework at <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.09388v1-abstract-full').style.display = 'none'; document.getElementById('2406.09388v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to CVPRW 2024 on 'What is Next in Multimodal Foundation Models?'. Code:</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="">arXiv:2406.04517</a> <span> [<a href="">pdf</a>, <a href="">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> FOX: Coverage-guided Fuzzing as Online Stochastic Control </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=She%2C+D">Dongdong She</a>, <a href="/search/cs?searchtype=author&query=Storek%2C+A">Adam Storek</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+Y">Yuchong Xie</a>, <a href="/search/cs?searchtype=author&query=Kweon%2C+S">Seoyoung Kweon</a>, <a href="/search/cs?searchtype=author&query=Srivastava%2C+P">Prashast Srivastava</a>, <a href="/search/cs?searchtype=author&query=Jana%2C+S">Suman Jana</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.04517v1-abstract-short" style="display: inline;"> Fuzzing is an effective technique for discovering software vulnerabilities by 
generating random test inputs and executing them against the target program. However, fuzzing large and complex programs remains challenging due to difficulties in uncovering deeply hidden vulnerabilities. This paper addresses the limitations of existing coverage-guided fuzzers, focusing on the scheduler and mutator comp… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.04517v1-abstract-full').style.display = 'inline'; document.getElementById('2406.04517v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.04517v1-abstract-full" style="display: none;"> Fuzzing is an effective technique for discovering software vulnerabilities by generating random test inputs and executing them against the target program. However, fuzzing large and complex programs remains challenging due to difficulties in uncovering deeply hidden vulnerabilities. This paper addresses the limitations of existing coverage-guided fuzzers, focusing on the scheduler and mutator components. Existing schedulers suffer from information sparsity and the inability to handle fine-grained feedback metrics. The mutators are agnostic of target program branches, leading to wasted computation and slower coverage exploration. To overcome these issues, we propose an end-to-end online stochastic control formulation for coverage-guided fuzzing. Our approach incorporates a novel scheduler and custom mutator that can adapt to branch logic, maximizing aggregate edge coverage achieved over multiple stages. The scheduler utilizes fine-grained branch distance measures to identify frontier branches, where new coverage is likely to be achieved. The mutator leverages branch distance information to perform efficient and targeted seed mutations, leading to robust progress with minimal overhead. 
We present FOX, a proof-of-concept implementation of our control-theoretic approach, and compare it to industry-standard coverage-guided fuzzers. 6 CPU-years of extensive evaluations on the FuzzBench dataset and complex real-world programs (a total of 38 test programs) demonstrate that FOX outperforms existing state-of-the-art fuzzers, achieving average coverage improvements up to 26.45% in real-world standalone programs and 6.59% in FuzzBench programs over the state-of-the-art AFL++. In addition, it uncovers 20 unique bugs in popular real-world applications including eight that are previously unknown, showcasing real-world security impact. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.04517v1-abstract-full').style.display = 'none'; document.getElementById('2406.04517v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To Appear in Proceedings of the 2024 ACM SIGSAC Conference on Computer and Communications Security (CCS '24)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="">arXiv:2406.02541</a> <span> [<a href="">pdf</a>, <a href="">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Enhancing Temporal Consistency in Video Editing by Reconstructing Videos with 3D Gaussian Splatting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shin%2C+I">Inkyu Shin</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+Q">Qihang Yu</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+X">Xiaohui Shen</a>, <a href="/search/cs?searchtype=author&query=Kweon%2C+I+S">In So Kweon</a>, <a href="/search/cs?searchtype=author&query=Yoon%2C+K">Kuk-Jin Yoon</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Liang-Chieh Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.02541v3-abstract-short" style="display: inline;"> Recent advancements in zero-shot video diffusion models have shown promise for text-driven video editing, but challenges remain in achieving high temporal consistency. To address this, we introduce Video-3DGS, a 3D Gaussian Splatting (3DGS)-based video refiner designed to enhance temporal consistency in zero-shot video editors. 
Our approach utilizes a two-stage 3D Gaussian optimizing process tailo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.02541v3-abstract-full').style.display = 'inline'; document.getElementById('2406.02541v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.02541v3-abstract-full" style="display: none;"> Recent advancements in zero-shot video diffusion models have shown promise for text-driven video editing, but challenges remain in achieving high temporal consistency. To address this, we introduce Video-3DGS, a 3D Gaussian Splatting (3DGS)-based video refiner designed to enhance temporal consistency in zero-shot video editors. Our approach utilizes a two-stage 3D Gaussian optimizing process tailored for editing dynamic monocular videos. In the first stage, Video-3DGS employs an improved version of COLMAP, referred to as MC-COLMAP, which processes original videos using a Masked and Clipped approach. For each video clip, MC-COLMAP generates the point clouds for dynamic foreground objects and complex backgrounds. These point clouds are utilized to initialize two sets of 3D Gaussians (Frg-3DGS and Bkg-3DGS) aiming to represent foreground and background views. Both foreground and background views are then merged with a 2D learnable parameter map to reconstruct full views. In the second stage, we leverage the reconstruction ability developed in the first stage to impose the temporal constraints on the video diffusion model. To demonstrate the efficacy of Video-3DGS on both stages, we conduct extensive experiments across two related tasks: Video Reconstruction and Video Editing. Video-3DGS trained with 3k iterations significantly improves video reconstruction quality (+3 PSNR, +7 PSNR increase) and training efficiency (x1.9, x4.5 times faster) over NeRF-based and 3DGS-based state-of-art methods on DAVIS dataset, respectively. 
Moreover, it enhances video editing by ensuring temporal consistency across 58 dynamic monocular videos. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.02541v3-abstract-full').style.display = 'none'; document.getElementById('2406.02541v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page at</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="">arXiv:2405.06673</a> <span> [<a href="">pdf</a>, <a href="">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Overview of the EHRSQL 2024 Shared Task on Reliable Text-to-SQL Modeling on Electronic Health Records </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lee%2C+G">Gyubok Lee</a>, <a href="/search/cs?searchtype=author&query=Kweon%2C+S">Sunjun Kweon</a>, <a href="/search/cs?searchtype=author&query=Bae%2C+S">Seongsu Bae</a>, <a href="/search/cs?searchtype=author&query=Choi%2C+E">Edward Choi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.06673v2-abstract-short" style="display: inline;"> 
Electronic Health Records (EHRs) are relational databases that store the entire medical histories of patients within hospitals. They record numerous aspects of patients' medical care, from hospital admission and diagnosis to treatment and discharge. While EHRs are vital sources of clinical data, exploring them beyond a predefined set of queries requires skills in query languages like SQL. To make… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.06673v2-abstract-full').style.display = 'inline'; document.getElementById('2405.06673v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.06673v2-abstract-full" style="display: none;"> Electronic Health Records (EHRs) are relational databases that store the entire medical histories of patients within hospitals. They record numerous aspects of patients' medical care, from hospital admission and diagnosis to treatment and discharge. While EHRs are vital sources of clinical data, exploring them beyond a predefined set of queries requires skills in query languages like SQL. To make information retrieval more accessible, one strategy is to build a question-answering system, possibly leveraging text-to-SQL models that can automatically translate natural language questions into corresponding SQL queries and use these queries to retrieve the answers. The EHRSQL 2024 shared task aims to advance and promote research in developing a question-answering system for EHRs using text-to-SQL modeling, capable of reliably providing requested answers to various healthcare professionals to improve their clinical work processes and satisfy their needs. Among more than 100 participants who applied to the shared task, eight teams were formed and completed the entire shared task requirement and demonstrated a wide range of methods to effectively solve this task. 
In this paper, we describe the task of reliable text-to-SQL modeling, the dataset, and the methods and results of the participants. We hope this shared task will spur further research and insights into developing reliable question-answering systems for EHRs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.06673v2-abstract-full').style.display = 'none'; document.getElementById('2405.06673v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The 6th Clinical Natural Language Processing Workshop at NAACL 2024; Minor Change from Camera-Ready</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="">arXiv:2404.14616</a> <span> [<a href="">pdf</a>, <a href="">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> </div> </div> <p class="title is-5 mathjax"> What Makes A Video Radicalizing? 
Identifying Sources of Influence in QAnon Videos </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ai%2C+L">Lin Ai</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yu-Wen Chen</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+Y">Yuwen Yu</a>, <a href="/search/cs?searchtype=author&query=Kweon%2C+S">Seoyoung Kweon</a>, <a href="/search/cs?searchtype=author&query=Hirschberg%2C+J">Julia Hirschberg</a>, <a href="/search/cs?searchtype=author&query=Levitan%2C+S+I">Sarah Ita Levitan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.14616v1-abstract-short" style="display: inline;"> In recent years, radicalization is being increasingly attempted on video-sharing platforms. Previous studies have been proposed to identify online radicalization using generic social context analysis, without taking into account comprehensive viewer traits and how those can affect viewers' perception of radicalizing content. To address the challenge, we examine QAnon, a conspiracy-based radicalizi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.14616v1-abstract-full').style.display = 'inline'; document.getElementById('2404.14616v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.14616v1-abstract-full" style="display: none;"> In recent years, radicalization is being increasingly attempted on video-sharing platforms. Previous studies have been proposed to identify online radicalization using generic social context analysis, without taking into account comprehensive viewer traits and how those can affect viewers' perception of radicalizing content. 
To address the challenge, we examine QAnon, a conspiracy-based radicalizing group, and have designed a comprehensive questionnaire aiming to understand viewers' perceptions of QAnon videos. We outline the traits of viewers that QAnon videos are the most appealing to, and identify influential factors that impact viewers' perception of the videos. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.14616v1-abstract-full').style.display = 'none'; document.getElementById('2404.14616v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="">arXiv:2403.20225</a> <span> [<a href="">pdf</a>, <a href="">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MTMMC: A Large-Scale Real-World Multi-Modal Camera Tracking Benchmark </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Woo%2C+S">Sanghyun Woo</a>, <a href="/search/cs?searchtype=author&query=Park%2C+K">Kwanyong Park</a>, <a href="/search/cs?searchtype=author&query=Shin%2C+I">Inkyu Shin</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+M">Myungchul Kim</a>, <a href="/search/cs?searchtype=author&query=Kweon%2C+I+S">In So Kweon</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.20225v1-abstract-short" style="display: inline;"> Multi-target multi-camera tracking is a crucial task that involves 
identifying and tracking individuals over time using video streams from multiple cameras. This task has practical applications in various fields, such as visual surveillance, crowd behavior analysis, and anomaly detection. However, due to the difficulty and cost of collecting and labeling data, existing datasets for this task are e… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.20225v1-abstract-full').style.display = 'inline'; document.getElementById('2403.20225v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.20225v1-abstract-full" style="display: none;"> Multi-target multi-camera tracking is a crucial task that involves identifying and tracking individuals over time using video streams from multiple cameras. This task has practical applications in various fields, such as visual surveillance, crowd behavior analysis, and anomaly detection. However, due to the difficulty and cost of collecting and labeling data, existing datasets for this task are either synthetically generated or artificially constructed within a controlled camera network setting, which limits their ability to model real-world dynamics and generalize to diverse camera configurations. To address this issue, we present MTMMC, a real-world, large-scale dataset that includes long video sequences captured by 16 multi-modal cameras in two different environments - campus and factory - across various time, weather, and season conditions. This dataset provides a challenging test-bed for studying multi-camera tracking under diverse real-world complexities and includes an additional input modality of spatially aligned and temporally synchronized RGB and thermal cameras, which enhances the accuracy of multi-camera tracking. MTMMC is a super-set of existing datasets, benefiting independent fields such as person detection, re-identification, and multiple object tracking. 
We provide baselines and new learning setups on this dataset and set the reference scores for future studies. The datasets, models, and test server will be made publicly available. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.20225v1-abstract-full').style.display = 'none'; document.getElementById('2403.20225v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted on CVPR 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="">arXiv:2403.19985</a> <span> [<a href="">pdf</a>, <a href="">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Stable Surface Regularization for Fast Few-Shot NeRF </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Joung%2C+B">Byeongin Joung</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+B">Byeong-Uk Lee</a>, <a href="/search/cs?searchtype=author&query=Choe%2C+J">Jaesung Choe</a>, <a href="/search/cs?searchtype=author&query=Shin%2C+U">Ukcheol Shin</a>, <a href="/search/cs?searchtype=author&query=Kang%2C+M">Minjun Kang</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+T">Taeyeop Lee</a>, <a href="/search/cs?searchtype=author&query=Kweon%2C+I+S">In So Kweon</a>, <a href="/search/cs?searchtype=author&query=Yoon%2C+K">Kuk-Jin Yoon</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis 
has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.19985v1-abstract-short" style="display: inline;"> This paper proposes an algorithm for synthesizing novel views under few-shot setup. The main concept is to develop a stable surface regularization technique called Annealing Signed Distance Function (ASDF), which anneals the surface in a coarse-to-fine manner to accelerate convergence speed. We observe that the Eikonal loss - which is a widely known geometric regularization - requires dense traini… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.19985v1-abstract-full').style.display = 'inline'; document.getElementById('2403.19985v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.19985v1-abstract-full" style="display: none;"> This paper proposes an algorithm for synthesizing novel views under few-shot setup. The main concept is to develop a stable surface regularization technique called Annealing Signed Distance Function (ASDF), which anneals the surface in a coarse-to-fine manner to accelerate convergence speed. We observe that the Eikonal loss - which is a widely known geometric regularization - requires dense training signal to shape different level-sets of SDF, leading to low-fidelity results under few-shot training. In contrast, the proposed surface regularization successfully reconstructs scenes and produces high-fidelity geometry with stable training. Our method is further accelerated by utilizing grid representation and monocular geometric priors. Finally, the proposed approach is up to 45 times faster than existing few-shot novel view synthesis methods, and it produces comparable results in the ScanNet dataset and NeRF-Real dataset. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.19985v1-abstract-full').style.display = 'none'; document.getElementById('2403.19985v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">3DV 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="">arXiv:2403.19150</a> <span> [<a href="">pdf</a>, <a href="">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Towards Understanding Dual BN In Hybrid Adversarial Training </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chenshuang Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chaoning Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kang Zhang</a>, <a href="/search/cs?searchtype=author&query=Niu%2C+A">Axi Niu</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+J">Junmo Kim</a>, <a href="/search/cs?searchtype=author&query=Kweon%2C+I+S">In So Kweon</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis 
has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.19150v1-abstract-short" style="display: inline;"> There is a growing concern about applying batch normalization (BN) in adversarial training (AT), especially when the model is trained on both adversarial samples and clean samples (termed Hybrid-AT). With the assumption that adversarial and clean samples are from two different domains, a common practice in prior works is to adopt Dual BN, where BN$_{adv}$ and BN$_{clean}$ are used for adversarial and clean branches… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.19150v1-abstract-full').style.display = 'inline'; document.getElementById('2403.19150v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.19150v1-abstract-full" style="display: none;"> There is a growing concern about applying batch normalization (BN) in adversarial training (AT), especially when the model is trained on both adversarial samples and clean samples (termed Hybrid-AT). With the assumption that adversarial and clean samples are from two different domains, a common practice in prior works is to adopt Dual BN, where BN$_{adv}$ and BN$_{clean}$ are used for adversarial and clean branches, respectively. A popular belief for motivating Dual BN is that estimating normalization statistics of this mixture distribution is challenging and thus disentangling it for normalization achieves stronger robustness. In contrast to this belief, we reveal that disentangling statistics plays a lesser role than disentangling affine parameters in model training. This finding aligns with prior work (Rebuffi et al., 2023), and we build upon their research for further investigations. We demonstrate that the domain gap between adversarial and clean samples is not very large, which is counter-intuitive considering the significant influence of adversarial perturbation on the model accuracy. 
We further propose a two-task hypothesis which serves as the empirical foundation and a unified framework for Hybrid-AT improvement. We also investigate Dual BN in test-time and reveal that affine parameters characterize the robustness during inference. Overall, our work sheds new light on understanding the mechanism of Dual BN in Hybrid-AT and its underlying justification. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.19150v1-abstract-full').style.display = 'none'; document.getElementById('2403.19150v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at TMLR</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="">arXiv:2403.18775</a> <span> [<a href="">pdf</a>, <a href="">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> ImageNet-D: Benchmarking Neural Network Robustness on Diffusion Synthetic Object </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chenshuang Zhang</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+F">Fei Pan</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+J">Junmo Kim</a>, <a 
href="/search/cs?searchtype=author&query=Kweon%2C+I+S">In So Kweon</a>, <a href="/search/cs?searchtype=author&query=Mao%2C+C">Chengzhi Mao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.18775v1-abstract-short" style="display: inline;"> We establish rigorous benchmarks for visual perception robustness. Synthetic images such as ImageNet-C, ImageNet-9, and Stylized ImageNet provide specific type of evaluation over synthetic corruptions, backgrounds, and textures, yet those robustness benchmarks are restricted in specified variations and have low synthetic quality. In this work, we introduce generative model as a data source for syn… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.18775v1-abstract-full').style.display = 'inline'; document.getElementById('2403.18775v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.18775v1-abstract-full" style="display: none;"> We establish rigorous benchmarks for visual perception robustness. Synthetic images such as ImageNet-C, ImageNet-9, and Stylized ImageNet provide specific type of evaluation over synthetic corruptions, backgrounds, and textures, yet those robustness benchmarks are restricted in specified variations and have low synthetic quality. In this work, we introduce generative model as a data source for synthesizing hard images that benchmark deep models' robustness. Leveraging diffusion models, we are able to generate images with more diversified backgrounds, textures, and materials than any prior work, where we term this benchmark as ImageNet-D. 
Experimental results show that ImageNet-D results in a significant accuracy drop to a range of vision models, from the standard ResNet visual classifier to the latest foundation models like CLIP and MiniGPT-4, significantly reducing their accuracy by up to 60\%. Our work suggests that diffusion models can be an effective source to test vision models. The code and dataset are available at <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.18775v1-abstract-full').style.display = 'none'; document.getElementById('2403.18775v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at CVPR 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="">arXiv:2403.01469</a> <span> [<a href="">pdf</a>, <a href="">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> KorMedMCQA: Multi-Choice Question Answering Benchmark for Korean Healthcare Professional Licensing Examinations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kweon%2C+S">Sunjun Kweon</a>, <a href="/search/cs?searchtype=author&query=Choi%2C+B">Byungjin Choi</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+M">Minkyu Kim</a>, <a href="/search/cs?searchtype=author&query=Park%2C+R+W">Rae Woong Park</a>, <a href="/search/cs?searchtype=author&query=Choi%2C+E">Edward Choi</a> </p> <p class="abstract mathjax"> <span 
class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.01469v2-abstract-short" style="display: inline;"> We introduce KorMedMCQA, the first Korean multiple-choice question answering (MCQA) benchmark derived from Korean healthcare professional licensing examinations, covering from the year 2012 to year 2023. This dataset consists of a selection of questions from the license examinations for doctors, nurses, and pharmacists, featuring a diverse array of subjects. We conduct baseline experiments on vari… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.01469v2-abstract-full').style.display = 'inline'; document.getElementById('2403.01469v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.01469v2-abstract-full" style="display: none;"> We introduce KorMedMCQA, the first Korean multiple-choice question answering (MCQA) benchmark derived from Korean healthcare professional licensing examinations, covering from the year 2012 to year 2023. This dataset consists of a selection of questions from the license examinations for doctors, nurses, and pharmacists, featuring a diverse array of subjects. We conduct baseline experiments on various large language models, including proprietary/open-source, multilingual/Korean-additional pretrained, and clinical context pretrained models, highlighting the potential for further enhancements. We make our data publicly available on HuggingFace and provide an evaluation script via LM-Harness, inviting further exploration and advancement in Korean healthcare environments. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.01469v2-abstract-full').style.display = 'none'; document.getElementById('2403.01469v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="">arXiv:2402.16040</a> <span> [<a href="">pdf</a>, <a href="">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> EHRNoteQA: An LLM Benchmark for Real-World Clinical Practice Using Discharge Summaries </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kweon%2C+S">Sunjun Kweon</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+J">Jiyoun Kim</a>, <a href="/search/cs?searchtype=author&query=Kwak%2C+H">Heeyoung Kwak</a>, <a href="/search/cs?searchtype=author&query=Cha%2C+D">Dongchul Cha</a>, <a href="/search/cs?searchtype=author&query=Yoon%2C+H">Hangyul Yoon</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+K">Kwanghyun Kim</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jeewon Yang</a>, <a href="/search/cs?searchtype=author&query=Won%2C+S">Seunghyun Won</a>, <a href="/search/cs?searchtype=author&query=Choi%2C+E">Edward Choi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.16040v5-abstract-short" style="display: inline;"> Discharge summaries in Electronic 
Health Records (EHRs) are crucial for clinical decision-making, but their length and complexity make information extraction challenging, especially when dealing with accumulated summaries across multiple patient admissions. Large Language Models (LLMs) show promise in addressing this challenge by efficiently analyzing vast and complex data. Existing benchmarks, ho… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.16040v5-abstract-full').style.display = 'inline'; document.getElementById('2402.16040v5-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.16040v5-abstract-full" style="display: none;"> Discharge summaries in Electronic Health Records (EHRs) are crucial for clinical decision-making, but their length and complexity make information extraction challenging, especially when dealing with accumulated summaries across multiple patient admissions. Large Language Models (LLMs) show promise in addressing this challenge by efficiently analyzing vast and complex data. Existing benchmarks, however, fall short in properly evaluating LLMs' capabilities in this context, as they typically focus on single-note information or limited topics, failing to reflect the real-world inquiries required by clinicians. To bridge this gap, we introduce EHRNoteQA, a novel benchmark built on the MIMIC-IV EHR, comprising 962 different QA pairs each linked to distinct patients' discharge summaries. Every QA pair is initially generated using GPT-4 and then manually reviewed and refined by three clinicians to ensure clinical relevance. EHRNoteQA includes questions that require information across multiple discharge summaries and covers eight diverse topics, mirroring the complexity and diversity of real clinical inquiries. We offer EHRNoteQA in two formats: open-ended and multi-choice question answering, and propose a reliable evaluation method for each. 
We evaluate 27 LLMs using EHRNoteQA and examine various factors affecting the model performance (e.g., the length and number of discharge summaries). Furthermore, to validate EHRNoteQA as a reliable proxy for expert evaluations in clinical practice, we measure the correlation between the LLM performance on EHRNoteQA, and the LLM performance manually evaluated by clinicians. Results show that LLM performance on EHRNoteQA has higher correlation with clinician-evaluated performance (Spearman: 0.78, Kendall: 0.62) compared to other benchmarks, demonstrating its practical relevance in evaluating LLMs in clinical settings. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.16040v5-abstract-full').style.display = 'none'; document.getElementById('2402.16040v5-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 25 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2024 (Datasets and Benchmarks)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="">arXiv:2311.18508</a> <span> [<a href="">pdf</a>, <a href="">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> DifAugGAN: A Practical Diffusion-style Data Augmentation for GAN-based Single Image Super-resolution </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Niu%2C+A">Axi Niu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kang Zhang</a>, <a href="/search/cs?searchtype=author&query=Tee%2C+J+T+J">Joshua Tian Jin Tee</a>, <a href="/search/cs?searchtype=author&query=Pham%2C+T+X">Trung X. Pham</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+J">Jinqiu Sun</a>, <a href="/search/cs?searchtype=author&query=Yoo%2C+C+D">Chang D. Yoo</a>, <a href="/search/cs?searchtype=author&query=Kweon%2C+I+S">In So Kweon</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yanning Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.18508v1-abstract-short" style="display: inline;"> It is well known the adversarial optimization of GAN-based image super-resolution (SR) methods makes the preceding SR model generate unpleasant and undesirable artifacts, leading to large distortion. 
We attribute the cause of such distortions to the poor calibration of the discriminator, which hampers its ability to provide meaningful feedback to the generator for learning high-quality images. To… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.18508v1-abstract-full').style.display = 'inline'; document.getElementById('2311.18508v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.18508v1-abstract-full" style="display: none;"> It is well known the adversarial optimization of GAN-based image super-resolution (SR) methods makes the preceding SR model generate unpleasant and undesirable artifacts, leading to large distortion. We attribute the cause of such distortions to the poor calibration of the discriminator, which hampers its ability to provide meaningful feedback to the generator for learning high-quality images. To address this problem, we propose a simple but non-trivial diffusion-style data augmentation scheme for current GAN-based SR methods, known as DifAugGAN. It involves adapting the diffusion process in generative diffusion models for improving the calibration of the discriminator during training motivated by the successes of data augmentation schemes in the field to achieve good calibration. Our DifAugGAN can be a Plug-and-Play strategy for current GAN-based SISR methods to improve the calibration of the discriminator and thus improve SR performance. Extensive experimental evaluations demonstrate the superiority of DifAugGAN over state-of-the-art GAN-based SISR methods across both synthetic and real-world datasets, showcasing notable advancements in both qualitative and quantitative results. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.18508v1-abstract-full').style.display = 'none'; document.getElementById('2311.18508v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="">arXiv:2311.04430</a> <span> [<a href="">pdf</a>, <a href="">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Blurry Video Compression: A Trade-off between Visual Enhancement and Data Compression </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Argaw%2C+D+M">Dawit Mureja Argaw</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+J">Junsik Kim</a>, <a href="/search/cs?searchtype=author&query=Kweon%2C+I+S">In So Kweon</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.04430v1-abstract-short" style="display: inline;"> Existing video compression (VC) methods primarily aim to reduce the spatial and temporal redundancies between consecutive frames in a video while preserving its quality. In this regard, previous works have achieved remarkable results on videos acquired under specific settings such as instant (known) exposure time and shutter speed which often result in sharp videos. 
However, when these methods are… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.04430v1-abstract-full').style.display = 'inline'; document.getElementById('2311.04430v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.04430v1-abstract-full" style="display: none;"> Existing video compression (VC) methods primarily aim to reduce the spatial and temporal redundancies between consecutive frames in a video while preserving its quality. In this regard, previous works have achieved remarkable results on videos acquired under specific settings such as instant (known) exposure time and shutter speed which often result in sharp videos. However, when these methods are evaluated on videos captured under different temporal priors, which lead to degradations like motion blur and low frame rate, they fail to maintain the quality of the contents. In this work, we tackle the VC problem in a general scenario where a given video can be blurry due to predefined camera settings or dynamics in the scene. By exploiting the natural trade-off between visual enhancement and data compression, we formulate VC as a min-max optimization problem and propose an effective framework and training strategy to tackle the problem. Extensive experimental results on several benchmark datasets confirm the effectiveness of our method compared to several state-of-the-art VC approaches. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.04430v1-abstract-full').style.display = 'none'; document.getElementById('2311.04430v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to WACV 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="">arXiv:2310.18652</a> <span> [<a href="">pdf</a>, <a href="">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> EHRXQA: A Multi-Modal Question Answering Dataset for Electronic Health Records with Chest X-ray Images </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Bae%2C+S">Seongsu Bae</a>, <a href="/search/cs?searchtype=author&query=Kyung%2C+D">Daeun Kyung</a>, <a href="/search/cs?searchtype=author&query=Ryu%2C+J">Jaehee Ryu</a>, <a href="/search/cs?searchtype=author&query=Cho%2C+E">Eunbyeol Cho</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+G">Gyubok Lee</a>, <a href="/search/cs?searchtype=author&query=Kweon%2C+S">Sunjun Kweon</a>, <a href="/search/cs?searchtype=author&query=Oh%2C+J">Jungwoo Oh</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+L">Lei Ji</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+E+I">Eric I-Chao Chang</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+T">Tackeun Kim</a>, <a href="/search/cs?searchtype=author&query=Choi%2C+E">Edward Choi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.18652v2-abstract-short" style="display: inline;"> Electronic 
Health Records (EHRs), which contain patients' medical histories in various multi-modal formats, often overlook the potential for joint reasoning across imaging and table modalities underexplored in current EHR Question Answering (QA) systems. In this paper, we introduce EHRXQA, a novel multi-modal question answering dataset combining structured EHRs and chest X-ray images. To develop o… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.18652v2-abstract-full').style.display = 'inline'; document.getElementById('2310.18652v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.18652v2-abstract-full" style="display: none;"> Electronic Health Records (EHRs), which contain patients' medical histories in various multi-modal formats, often overlook the potential for joint reasoning across imaging and table modalities underexplored in current EHR Question Answering (QA) systems. In this paper, we introduce EHRXQA, a novel multi-modal question answering dataset combining structured EHRs and chest X-ray images. To develop our dataset, we first construct two uni-modal resources: 1) The MIMIC-CXR-VQA dataset, our newly created medical visual question answering (VQA) benchmark, specifically designed to augment the imaging modality in EHR QA, and 2) EHRSQL (MIMIC-IV), a refashioned version of a previously established table-based EHR QA dataset. By integrating these two uni-modal resources, we successfully construct a multi-modal EHR QA dataset that necessitates both uni-modal and cross-modal reasoning. To address the unique challenges of multi-modal questions within EHRs, we propose a NeuralSQL-based strategy equipped with an external VQA API. This pioneering endeavor enhances engagement with multi-modal EHR sources and we believe that our dataset can catalyze advances in real-world medical scenarios such as clinical decision-making and research. 
EHRXQA is available at <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.18652v2-abstract-full').style.display = 'none'; document.getElementById('2310.18652v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at NeurIPS 2023 Datasets and Benchmarks Track (10 pages for main text, 4 pages for references, 39 pages for supplementary materials)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="">arXiv:2309.11711</a> <span> [<a href="">pdf</a>, <a href="">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MoDA: Leveraging Motion Priors from Videos for Advancing Unsupervised Domain Adaptation in Semantic Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Pan%2C+F">Fei Pan</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+X">Xu Yin</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+S">Seokju Lee</a>, <a href="/search/cs?searchtype=author&query=Niu%2C+A">Axi Niu</a>, <a href="/search/cs?searchtype=author&query=Yoon%2C+S">Sungeui Yoon</a>, <a href="/search/cs?searchtype=author&query=Kweon%2C+I+S">In So Kweon</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span 
class="abstract-short has-text-grey-dark mathjax" id="2309.11711v2-abstract-short" style="display: inline;"> Unsupervised domain adaptation (UDA) has been a potent technique to handle the lack of annotations in the target domain, particularly in semantic segmentation task. This study introduces a different UDA scenario where the target domain contains unlabeled video frames. Drawing upon recent advancements of self-supervised learning of the object motion from unlabeled videos with geometric constraint,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.11711v2-abstract-full').style.display = 'inline'; document.getElementById('2309.11711v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2309.11711v2-abstract-full" style="display: none;"> Unsupervised domain adaptation (UDA) has been a potent technique to handle the lack of annotations in the target domain, particularly in semantic segmentation task. This study introduces a different UDA scenario where the target domain contains unlabeled video frames. Drawing upon recent advancements of self-supervised learning of the object motion from unlabeled videos with geometric constraint, we design a <b>Mo</b>tion-guided <b>D</b>omain <b>A</b>daptive semantic segmentation framework (MoDA). MoDA harnesses the self-supervised object motion cues to facilitate cross-domain alignment for segmentation task. First, we present an object discovery module to localize and segment target moving objects using object motion information. Then, we propose a semantic mining module that takes the object masks to refine the pseudo labels in the target domain. Subsequently, these high-quality pseudo labels are used in the self-training loop to bridge the cross-domain gap. 
On domain adaptive video and image segmentation experiments, MoDA shows the effectiveness of utilizing object motion as guidance for domain alignment compared with optical flow information. Moreover, MoDA exhibits versatility as it can complement existing state-of-the-art UDA approaches. Code at <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.11711v2-abstract-full').style.display = 'none'; document.getElementById('2309.11711v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">CVPR 2024 Workshop on Learning with Limited Labelled Data for Image and Video Understanding. 
Best Paper Award</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="">arXiv:2309.01961</a> <span> [<a href="">pdf</a>, <a href="">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> NICE: CVPR 2023 Challenge on Zero-shot Image Captioning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kim%2C+T">Taehoon Kim</a>, <a href="/search/cs?searchtype=author&query=Ahn%2C+P">Pyunghwan Ahn</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+S">Sangyun Kim</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+S">Sihaeng Lee</a>, <a href="/search/cs?searchtype=author&query=Marsden%2C+M">Mark Marsden</a>, <a href="/search/cs?searchtype=author&query=Sala%2C+A">Alessandra Sala</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+S+H">Seung Hwan Kim</a>, <a href="/search/cs?searchtype=author&query=Han%2C+B">Bohyung Han</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+K+M">Kyoung Mu Lee</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+H">Honglak Lee</a>, <a href="/search/cs?searchtype=author&query=Bae%2C+K">Kyounghoon Bae</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+X">Xiangyu Wu</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yi Gao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Hailiang Zhang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yang Yang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+W">Weili Guo</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+J">Jianfeng Lu</a>, <a href="/search/cs?searchtype=author&query=Oh%2C+Y">Youngtaek Oh</a>, <a href="/search/cs?searchtype=author&query=Cho%2C+J+W">Jae Won Cho</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+D">Dong-jin Kim</a>, <a 
href="/search/cs?searchtype=author&query=Kweon%2C+I+S">In So Kweon</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+J">Junmo Kim</a>, <a href="/search/cs?searchtype=author&query=Kang%2C+W">Wooyoung Kang</a>, <a href="/search/cs?searchtype=author&query=Jhoo%2C+W+Y">Won Young Jhoo</a>, <a href="/search/cs?searchtype=author&query=Roh%2C+B">Byungseok Roh</a> , et al. (17 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2309.01961v3-abstract-short" style="display: inline;"> In this report, we introduce NICE (New frontiers for zero-shot Image Captioning Evaluation) project and share the results and outcomes of 2023 challenge. This project is designed to challenge the computer vision community to develop robust image captioning models that advance the state-of-the-art both in terms of accuracy and fairness. Through the challenge, the image captioning models were tested… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.01961v3-abstract-full').style.display = 'inline'; document.getElementById('2309.01961v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2309.01961v3-abstract-full" style="display: none;"> In this report, we introduce NICE (New frontiers for zero-shot Image Captioning Evaluation) project and share the results and outcomes of 2023 challenge. This project is designed to challenge the computer vision community to develop robust image captioning models that advance the state-of-the-art both in terms of accuracy and fairness. Through the challenge, the image captioning models were tested using a new evaluation dataset that includes a large variety of visual concepts from many domains. 
There was no specific training data provided for the challenge, and therefore the challenge entries were required to adapt to new types of image descriptions that had not been seen during training. This report includes information on the newly proposed NICE dataset, evaluation methods, challenge results, and technical details of top-ranking entries. We expect that the outcomes of the challenge will contribute to the improvement of AI models on various vision-language tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.01961v3-abstract-full').style.display = 'none'; document.getElementById('2309.01961v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 5 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2023. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Tech report, project page</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="">arXiv:2309.00237</a> <span> [<a href="">pdf</a>, <a href="">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Publicly Shareable Clinical Large Language Model Built on Synthetic Clinical Notes </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kweon%2C+S">Sunjun Kweon</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+J">Junu Kim</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+J">Jiyoun Kim</a>, <a href="/search/cs?searchtype=author&query=Im%2C+S">Sujeong Im</a>, <a href="/search/cs?searchtype=author&query=Cho%2C+E">Eunbyeol Cho</a>, <a href="/search/cs?searchtype=author&query=Bae%2C+S">Seongsu Bae</a>, <a href="/search/cs?searchtype=author&query=Oh%2C+J">Jungwoo Oh</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+G">Gyubok Lee</a>, <a href="/search/cs?searchtype=author&query=Moon%2C+J+H">Jong Hak Moon</a>, <a href="/search/cs?searchtype=author&query=You%2C+S+C">Seng Chan You</a>, <a href="/search/cs?searchtype=author&query=Baek%2C+S">Seungjin Baek</a>, <a href="/search/cs?searchtype=author&query=Han%2C+C+H">Chang Hoon Han</a>, <a href="/search/cs?searchtype=author&query=Jung%2C+Y+B">Yoon Bin Jung</a>, <a href="/search/cs?searchtype=author&query=Jo%2C+Y">Yohan Jo</a>, <a href="/search/cs?searchtype=author&query=Choi%2C+E">Edward Choi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis 
has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2309.00237v4-abstract-short" style="display: inline;"> The development of large language models tailored for handling patients' clinical notes is often hindered by the limited accessibility and usability of these notes due to strict privacy regulations. To address these challenges, we first create synthetic large-scale clinical notes using publicly available case reports extracted from biomedical literature. We then use these synthetic notes to train… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.00237v4-abstract-full').style.display = 'inline'; document.getElementById('2309.00237v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2309.00237v4-abstract-full" style="display: none;"> The development of large language models tailored for handling patients' clinical notes is often hindered by the limited accessibility and usability of these notes due to strict privacy regulations. To address these challenges, we first create synthetic large-scale clinical notes using publicly available case reports extracted from biomedical literature. We then use these synthetic notes to train our specialized clinical large language model, Asclepius. While Asclepius is trained on synthetic data, we assess its potential performance in real-world applications by evaluating it using real clinical notes. We benchmark Asclepius against several other large language models, including GPT-3.5-turbo and other open-source alternatives. To further validate our approach using synthetic notes, we also compare Asclepius with its variants trained on real clinical notes. Our findings convincingly demonstrate that synthetic clinical notes can serve as viable substitutes for real ones when constructing high-performing clinical language models. 
This conclusion is supported by detailed evaluations conducted by both GPT-4 and medical professionals. All resources including weights, codes, and data used in the development of Asclepius are made publicly accessible for future research. ( <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.00237v4-abstract-full').style.display = 'none'; document.getElementById('2309.00237v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ACL 2024 (Findings)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="">arXiv:2308.09775</a> <span> [<a href="">pdf</a>, <a href="">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Long-range Multimodal Pretraining for Movie Understanding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Argaw%2C+D+M">Dawit Mureja Argaw</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+J">Joon-Young Lee</a>, <a href="/search/cs?searchtype=author&query=Woodson%2C+M">Markus Woodson</a>, <a href="/search/cs?searchtype=author&query=Kweon%2C+I+S">In So Kweon</a>, <a href="/search/cs?searchtype=author&query=Heilbron%2C+F+C">Fabian Caba Heilbron</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis 
has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2308.09775v1-abstract-short" style="display: inline;"> Learning computer vision models from (and for) movies has a long-standing history. While great progress has been attained, there is still a need for a pretrained multimodal model that can perform well in the ever-growing set of movie understanding tasks the community has been establishing. In this work, we introduce Long-range Multimodal Pretraining, a strategy, and a model that leverages movie da… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.09775v1-abstract-full').style.display = 'inline'; document.getElementById('2308.09775v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2308.09775v1-abstract-full" style="display: none;"> Learning computer vision models from (and for) movies has a long-standing history. While great progress has been attained, there is still a need for a pretrained multimodal model that can perform well in the ever-growing set of movie understanding tasks the community has been establishing. In this work, we introduce Long-range Multimodal Pretraining, a strategy, and a model that leverages movie data to train transferable multimodal and cross-modal encoders. Our key idea is to learn from all modalities in a movie by observing and extracting relationships over a long-range. After pretraining, we run ablation studies on the LVU benchmark and validate our modeling choices and the importance of learning from long-range time spans. Our model achieves state-of-the-art on several LVU tasks while being much more data efficient than previous works. Finally, we evaluate our model's transferability by setting a new state-of-the-art in five different benchmarks. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.09775v1-abstract-full').style.display = 'none'; document.getElementById('2308.09775v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to ICCV 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="">arXiv:2307.00781</a> <span> [<a href="">pdf</a>, <a href="">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> ACDMSR: Accelerated Conditional Diffusion Models for Single Image Super-Resolution </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Niu%2C+A">Axi Niu</a>, <a href="/search/cs?searchtype=author&query=Trung%2C+P+X">Pham Xuan Trung</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kang Zhang</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+J">Jinqiu Sun</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yu Zhu</a>, <a href="/search/cs?searchtype=author&query=Kweon%2C+I+S">In So Kweon</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yanning Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" 
id="2307.00781v1-abstract-short" style="display: inline;"> Diffusion models have gained significant popularity in the field of image-to-image translation. Previous efforts applying diffusion models to image super-resolution (SR) have demonstrated that iteratively refining pure Gaussian noise using a U-Net architecture trained on denoising at various noise levels can yield satisfactory high-resolution images from low-resolution inputs. However, this iterat… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.00781v1-abstract-full').style.display = 'inline'; document.getElementById('2307.00781v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2307.00781v1-abstract-full" style="display: none;"> Diffusion models have gained significant popularity in the field of image-to-image translation. Previous efforts applying diffusion models to image super-resolution (SR) have demonstrated that iteratively refining pure Gaussian noise using a U-Net architecture trained on denoising at various noise levels can yield satisfactory high-resolution images from low-resolution inputs. However, this iterative refinement process comes with the drawback of low inference speed, which strongly limits its applications. To speed up inference and further enhance the performance, our research revisits diffusion models in image super-resolution and proposes a straightforward yet significant diffusion model-based super-resolution method called ACDMSR (accelerated conditional diffusion model for image super-resolution). Specifically, our method adapts the standard diffusion model to perform super-resolution through a deterministic iterative denoising process. Our study also highlights the effectiveness of using a pre-trained SR model to provide the conditional image of the given low-resolution (LR) image to achieve superior high-resolution results. 
We demonstrate that our method surpasses previous attempts in qualitative and quantitative results through extensive experiments conducted on benchmark datasets such as Set5, Set14, Urban100, BSD100, and Manga109. Moreover, our approach generates more visually realistic counterparts for low-resolution images, emphasizing its effectiveness in practical scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.00781v1-abstract-full').style.display = 'none'; document.getElementById('2307.00781v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 July, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">arXiv admin note: text overlap with arXiv:2302.12831</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="">arXiv:2305.18547</a> <span> [<a href="">pdf</a>, <a href="">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Learning from Multi-Perception Features for Real-Word Image Super-resolution </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Niu%2C+A">Axi Niu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kang Zhang</a>, <a href="/search/cs?searchtype=author&query=Pham%2C+T+X">Trung X. 
Pham</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+P">Pei Wang</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+J">Jinqiu Sun</a>, <a href="/search/cs?searchtype=author&query=Kweon%2C+I+S">In So Kweon</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yanning Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.18547v1-abstract-short" style="display: inline;"> Currently, there are two popular approaches for addressing real-world image super-resolution problems: degradation-estimation-based and blind-based methods. However, degradation-estimation-based methods may be inaccurate in estimating the degradation, making them less applicable to real-world LR images. On the other hand, blind-based methods are often limited by their fixed single perception infor… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.18547v1-abstract-full').style.display = 'inline'; document.getElementById('2305.18547v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.18547v1-abstract-full" style="display: none;"> Currently, there are two popular approaches for addressing real-world image super-resolution problems: degradation-estimation-based and blind-based methods. However, degradation-estimation-based methods may be inaccurate in estimating the degradation, making them less applicable to real-world LR images. On the other hand, blind-based methods are often limited by their fixed single perception information, which hinders their ability to handle diverse perceptual characteristics. To overcome this limitation, we propose a novel SR method called MPF-Net that leverages multiple perceptual features of input images. 
Our method incorporates a Multi-Perception Feature Extraction (MPFE) module to extract diverse perceptual information and a series of newly-designed Cross-Perception Blocks (CPB) to combine this information for effective super-resolution reconstruction. Additionally, we introduce a contrastive regularization term (CR) that improves the model's learning capability by using newly generated HR and LR images as positive and negative samples for ground truth HR. Experimental results on challenging real-world SR datasets demonstrate that our approach significantly outperforms existing state-of-the-art methods in both qualitative and quantitative measures. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.18547v1-abstract-full').style.display = 'none'; document.getElementById('2305.18547v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="">arXiv:2305.07288</a> <span> [<a href="">pdf</a>, <a href="">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Open-WikiTable: Dataset for Open Domain Question Answering with Complex Reasoning over Table </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kweon%2C+S">Sunjun Kweon</a>, <a href="/search/cs?searchtype=author&query=Kwon%2C+Y">Yeonsu Kwon</a>, <a href="/search/cs?searchtype=author&query=Cho%2C+S">Seonhee Cho</a>, <a href="/search/cs?searchtype=author&query=Jo%2C+Y">Yohan Jo</a>, <a href="/search/cs?searchtype=author&query=Choi%2C+E">Edward Choi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.07288v1-abstract-short" style="display: inline;"> Despite recent interest in open domain question answering (ODQA) over tables, many studies still rely on datasets that are not truly optimal for the task with respect to utilizing structural nature of table. These datasets assume answers reside as a single cell value and do not necessitate exploring over multiple cells such as aggregation, comparison, and sorting. 
Thus, we release Open-WikiTable,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.07288v1-abstract-full').style.display = 'inline'; document.getElementById('2305.07288v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.07288v1-abstract-full" style="display: none;"> Despite recent interest in open domain question answering (ODQA) over tables, many studies still rely on datasets that are not truly optimal for the task with respect to utilizing structural nature of table. These datasets assume answers reside as a single cell value and do not necessitate exploring over multiple cells such as aggregation, comparison, and sorting. Thus, we release Open-WikiTable, the first ODQA dataset that requires complex reasoning over tables. Open-WikiTable is built upon WikiSQL and WikiTableQuestions to be applicable in the open-domain setting. As each question is coupled with both textual answers and SQL queries, Open-WikiTable opens up a wide range of possibilities for future research, as both reader and parser methods can be applied. The dataset and code are publicly available. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.07288v1-abstract-full').style.display = 'none'; document.getElementById('2305.07288v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ACL 2023 (Findings)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="">arXiv:2305.00866</a> <span> [<a href="">pdf</a>, <a href="">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Attack-SAM: Towards Attacking Segment Anything Model With Adversarial Examples </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chenshuang Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chaoning Zhang</a>, <a href="/search/cs?searchtype=author&query=Kang%2C+T">Taegoo Kang</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+D">Donghun Kim</a>, <a href="/search/cs?searchtype=author&query=Bae%2C+S">Sung-Ho Bae</a>, <a href="/search/cs?searchtype=author&query=Kweon%2C+I+S">In So Kweon</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.00866v2-abstract-short" style="display: inline;"> Segment Anything Model (SAM) has attracted significant attention recently, due to its impressive performance on various downstream tasks in a zero-shot manner. Computer vision (CV) area might follow the natural language processing (NLP) area to embark on a path from task-specific vision models toward foundation models. 
However, deep vision models are widely recognized as vulnerable to adversarial… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.00866v2-abstract-full').style.display = 'inline'; document.getElementById('2305.00866v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.00866v2-abstract-full" style="display: none;"> Segment Anything Model (SAM) has attracted significant attention recently, due to its impressive performance on various downstream tasks in a zero-shot manner. Computer vision (CV) area might follow the natural language processing (NLP) area to embark on a path from task-specific vision models toward foundation models. However, deep vision models are widely recognized as vulnerable to adversarial examples, which fool the model to make wrong predictions with imperceptible perturbation. Such vulnerability to adversarial attacks causes serious concerns when applying deep models to security-sensitive applications. Therefore, it is critical to know whether the vision foundation model SAM can also be fooled by adversarial attacks. To the best of our knowledge, our work is the first of its kind to conduct a comprehensive investigation on how to attack SAM with adversarial examples. With the basic attack goal set to mask removal, we investigate the adversarial robustness of SAM in the full white-box setting and transfer-based black-box settings. Beyond the basic goal of mask removal, we further investigate and find that it is possible to generate any desired mask by the adversarial attack. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.00866v2-abstract-full').style.display = 'none'; document.getElementById('2305.00866v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The first work to attack Segment Anything Model with adversarial examples</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="">arXiv:2304.06488</a> <span> [<a href="">pdf</a>, <a href="">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> One Small Step for Generative AI, One Giant Leap for AGI: A Complete Survey on ChatGPT in AIGC Era </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chaoning Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chenshuang Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Chenghao Li</a>, <a 
href="/search/cs?searchtype=author&query=Qiao%2C+Y">Yu Qiao</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+S">Sheng Zheng</a>, <a href="/search/cs?searchtype=author&query=Dam%2C+S+K">Sumit Kumar Dam</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Mengchun Zhang</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+J+U">Jung Uk Kim</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+S+T">Seong Tae Kim</a>, <a href="/search/cs?searchtype=author&query=Choi%2C+J">Jinwoo Choi</a>, <a href="/search/cs?searchtype=author&query=Park%2C+G">Gyeong-Moon Park</a>, <a href="/search/cs?searchtype=author&query=Bae%2C+S">Sung-Ho Bae</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+L">Lik-Hang Lee</a>, <a href="/search/cs?searchtype=author&query=Hui%2C+P">Pan Hui</a>, <a href="/search/cs?searchtype=author&query=Kweon%2C+I+S">In So Kweon</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+C+S">Choong Seon Hong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2304.06488v1-abstract-short" style="display: inline;"> OpenAI has recently released GPT-4 (a.k.a. ChatGPT plus), which is demonstrated to be one small step for generative AI (GAI), but one giant leap for artificial general intelligence (AGI). Since its official release in November 2022, ChatGPT has quickly attracted numerous users with extensive media coverage. Such unprecedented attention has also motivated numerous researchers to investigate ChatGPT… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.06488v1-abstract-full').style.display = 'inline'; document.getElementById('2304.06488v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2304.06488v1-abstract-full" style="display: none;"> OpenAI has recently released GPT-4 (a.k.a. 
ChatGPT plus), which is demonstrated to be one small step for generative AI (GAI), but one giant leap for artificial general intelligence (AGI). Since its official release in November 2022, ChatGPT has quickly attracted numerous users with extensive media coverage. Such unprecedented attention has also motivated numerous researchers to investigate ChatGPT from various aspects. According to Google scholar, there are more than 500 articles with ChatGPT in their titles or mentioning it in their abstracts. Considering this, a review is urgently needed, and our work fills this gap. Overall, this work is the first to survey ChatGPT with a comprehensive review of its underlying technology, applications, and challenges. Moreover, we present an outlook on how ChatGPT might evolve to realize general-purpose AIGC (a.k.a. AI-generated content), which will be a significant milestone for the development of AGI. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.06488v1-abstract-full').style.display = 'none'; document.getElementById('2304.06488v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 April, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">A Survey on ChatGPT and GPT-4, 29 pages. 
Feedback is appreciated (</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="">arXiv:2304.04694</a> <span> [<a href="">pdf</a>, <a href="">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Video-kMaX: A Simple Unified Approach for Online and Near-Online Video Panoptic Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shin%2C+I">Inkyu Shin</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+D">Dahun Kim</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+Q">Qihang Yu</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+J">Jun Xie</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+H">Hong-Seok Kim</a>, <a href="/search/cs?searchtype=author&query=Green%2C+B">Bradley Green</a>, <a href="/search/cs?searchtype=author&query=Kweon%2C+I+S">In So Kweon</a>, <a href="/search/cs?searchtype=author&query=Yoon%2C+K">Kuk-Jin Yoon</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Liang-Chieh Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2304.04694v1-abstract-short" style="display: inline;"> Video Panoptic Segmentation (VPS) aims to achieve comprehensive pixel-level scene understanding by segmenting all pixels and associating objects in a video. Current solutions can be categorized into online and near-online approaches. Evolving over the time, each category has its own specialized designs, making it nontrivial to adapt models between different categories. 
To alleviate the discrepancy… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.04694v1-abstract-full').style.display = 'inline'; document.getElementById('2304.04694v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2304.04694v1-abstract-full" style="display: none;"> Video Panoptic Segmentation (VPS) aims to achieve comprehensive pixel-level scene understanding by segmenting all pixels and associating objects in a video. Current solutions can be categorized into online and near-online approaches. Evolving over the time, each category has its own specialized designs, making it nontrivial to adapt models between different categories. To alleviate the discrepancy, in this work, we propose a unified approach for online and near-online VPS. The meta architecture of the proposed Video-kMaX consists of two components: within clip segmenter (for clip-level segmentation) and cross-clip associater (for association beyond clips). We propose clip-kMaX (clip k-means mask transformer) and HiLA-MB (Hierarchical Location-Aware Memory Buffer) to instantiate the segmenter and associater, respectively. Our general formulation includes the online scenario as a special case by adopting clip length of one. Without bells and whistles, Video-kMaX sets a new state-of-the-art on KITTI-STEP and VIPSeg for video panoptic segmentation, and VSPW for video semantic segmentation. Code will be made publicly available. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.04694v1-abstract-full').style.display = 'none'; document.getElementById('2304.04694v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 April, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2023. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="">arXiv:2303.17517</a> <span> [<a href="">pdf</a>, <a href="">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Hindi as a Second Language: Improving Visually Grounded Speech with Semantically Similar Samples </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ryu%2C+H">Hyeonggon Ryu</a>, <a href="/search/cs?searchtype=author&query=Senocak%2C+A">Arda Senocak</a>, <a href="/search/cs?searchtype=author&query=Kweon%2C+I+S">In So Kweon</a>, <a href="/search/cs?searchtype=author&query=Chung%2C+J+S">Joon Son Chung</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2303.17517v1-abstract-short" style="display: inline;"> The objective of this work is to explore the learning of visually grounded speech models (VGS) from multilingual perspective. Bilingual VGS models are generally trained with an equal number of spoken captions from both languages. However, in reality, there can be an imbalance among the languages for the available spoken captions. 
Our key contribution in this work is to leverage the power of a high… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.17517v1-abstract-full').style.display = 'inline'; document.getElementById('2303.17517v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2303.17517v1-abstract-full" style="display: none;"> The objective of this work is to explore the learning of visually grounded speech models (VGS) from multilingual perspective. Bilingual VGS models are generally trained with an equal number of spoken captions from both languages. However, in reality, there can be an imbalance among the languages for the available spoken captions. Our key contribution in this work is to leverage the power of a high-resource language in a bilingual visually grounded speech model to improve the performance of a low-resource language. We introduce two methods to distill the knowledge of high-resource language into low-resource languages: (1) incorporating a strong pre-trained high-resource language encoder and (2) using semantically similar spoken captions. Our experiments show that combining these two approaches effectively enables the low-resource language to surpass the performances of monolingual and bilingual counterparts for cross-modal retrieval tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.17517v1-abstract-full').style.display = 'none'; document.getElementById('2303.17517v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 March, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2023. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICASSP 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="">arXiv:2303.17386</a> <span> [<a href="">pdf</a>, <a href="">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Complementary Random Masking for RGB-Thermal Semantic Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shin%2C+U">Ukcheol Shin</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+K">Kyunghyun Lee</a>, <a href="/search/cs?searchtype=author&query=Kweon%2C+I+S">In So Kweon</a>, <a href="/search/cs?searchtype=author&query=Oh%2C+J">Jean Oh</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2303.17386v2-abstract-short" style="display: inline;"> RGB-thermal semantic segmentation is one potential solution to achieve reliable semantic scene understanding in adverse weather and lighting conditions. However, the previous studies mostly focus on designing a multi-modal fusion module without consideration of the nature of multi-modality inputs. 
Therefore, the networks easily become over-reliant on a single modality, making it difficult to learn… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.17386v2-abstract-full').style.display = 'inline'; document.getElementById('2303.17386v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2303.17386v2-abstract-full" style="display: none;"> RGB-thermal semantic segmentation is one potential solution to achieve reliable semantic scene understanding in adverse weather and lighting conditions. However, the previous studies mostly focus on designing a multi-modal fusion module without consideration of the nature of multi-modality inputs. Therefore, the networks easily become over-reliant on a single modality, making it difficult to learn complementary and meaningful representations for each modality. This paper proposes 1) a complementary random masking strategy of RGB-T images and 2) self-distillation loss between clean and masked input modalities. The proposed masking strategy prevents over-reliance on a single modality. It also improves the accuracy and robustness of the neural network by forcing the network to segment and classify objects even when one modality is partially available. Also, the proposed self-distillation loss encourages the network to extract complementary and meaningful representations from a single modality or complementary masked modalities. Based on the proposed method, we achieve state-of-the-art performance over three RGB-T semantic segmentation benchmarks. 
Our source code is available at <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.17386v2-abstract-full').style.display = 'none'; document.getElementById('2303.17386v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 30 March, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICRA 2024, Our source code is available at</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="">arXiv:2303.16730</a> <span> [<a href="">pdf</a>, <a href="">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> TTA-COPE: Test-Time Adaptation for Category-Level Object Pose Estimation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lee%2C+T">Taeyeop Lee</a>, <a href="/search/cs?searchtype=author&query=Tremblay%2C+J">Jonathan Tremblay</a>, <a href="/search/cs?searchtype=author&query=Blukis%2C+V">Valts Blukis</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+B">Bowen Wen</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+B">Byeong-Uk Lee</a>, <a href="/search/cs?searchtype=author&query=Shin%2C+I">Inkyu Shin</a>, <a href="/search/cs?searchtype=author&query=Birchfield%2C+S">Stan Birchfield</a>, <a href="/search/cs?searchtype=author&query=Kweon%2C+I+S">In So Kweon</a>, <a href="/search/cs?searchtype=author&query=Yoon%2C+K">Kuk-Jin Yoon</a> </p> 
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2303.16730v1-abstract-short" style="display: inline;"> Test-time adaptation methods have been gaining attention recently as a practical solution for addressing source-to-target domain gaps by gradually updating the model without requiring labels on the target data. In this paper, we propose a method of test-time adaptation for category-level object pose estimation called TTA-COPE. We design a pose ensemble approach with a self-training loss using pose… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.16730v1-abstract-full').style.display = 'inline'; document.getElementById('2303.16730v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2303.16730v1-abstract-full" style="display: none;"> Test-time adaptation methods have been gaining attention recently as a practical solution for addressing source-to-target domain gaps by gradually updating the model without requiring labels on the target data. In this paper, we propose a method of test-time adaptation for category-level object pose estimation called TTA-COPE. We design a pose ensemble approach with a self-training loss using pose-aware confidence. Unlike previous unsupervised domain adaptation methods for category-level object pose estimation, our approach processes the test data in a sequential, online manner, and it does not require access to the source domain at runtime. Extensive experimental results demonstrate that the proposed pose ensemble and the self-training loss improve category-level object pose performance during test time under both semi-supervised and unsupervised settings. 
Project page: <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.16730v1-abstract-full').style.display = 'none'; document.getElementById('2303.16730v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 March, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to CVPR 2023, Project page:</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="">arXiv:2303.13336</a> <span> [<a href="">pdf</a>, <a href="">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> A Survey on Audio Diffusion Models: Text To Speech Synthesis and Enhancement in Generative AI </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chenshuang Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chaoning Zhang</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+S">Sheng Zheng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Mengchun Zhang</a>, <a href="/search/cs?searchtype=author&query=Qamar%2C+M">Maryam Qamar</a>, <a 
href="/search/cs?searchtype=author&query=Bae%2C+S">Sung-Ho Bae</a>, <a href="/search/cs?searchtype=author&query=Kweon%2C+I+S">In So Kweon</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2303.13336v2-abstract-short" style="display: inline;"> Generative AI has demonstrated impressive performance in various fields, among which speech synthesis is an interesting direction. With the diffusion model as the most popular generative model, numerous works have attempted two active tasks: text to speech and speech enhancement. This work conducts a survey on audio diffusion model, which is complementary to existing surveys that either lack the r… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.13336v2-abstract-full').style.display = 'inline'; document.getElementById('2303.13336v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2303.13336v2-abstract-full" style="display: none;"> Generative AI has demonstrated impressive performance in various fields, among which speech synthesis is an interesting direction. With the diffusion model as the most popular generative model, numerous works have attempted two active tasks: text to speech and speech enhancement. This work conducts a survey on audio diffusion model, which is complementary to existing surveys that either lack the recent progress of diffusion-based speech synthesis or highlight an overall picture of applying diffusion model in multiple fields. Specifically, this work first briefly introduces the background of audio and diffusion model. As for the text-to-speech task, we divide the methods into three categories based on the stage where diffusion model is adopted: acoustic model, vocoder and end-to-end framework. 
Moreover, we categorize various speech enhancement tasks by whether certain signals are removed from or added to the input speech. Comparisons of experimental results and discussions are also covered in this survey. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.13336v2-abstract-full').style.display = 'none'; document.getElementById('2303.13336v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 April, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 March, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">18 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="">arXiv:2303.11771</a> <span> [<a href="">pdf</a>, <a href="">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Self-Sufficient Framework for Continuous Sign Language Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jang%2C+Y">Youngjoon Jang</a>, <a href="/search/cs?searchtype=author&query=Oh%2C+Y">Youngtaek Oh</a>, <a href="/search/cs?searchtype=author&query=Cho%2C+J+W">Jae Won Cho</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+M">Myungchul Kim</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+D">Dong-Jin Kim</a>, <a href="/search/cs?searchtype=author&query=Kweon%2C+I+S">In So Kweon</a>, <a href="/search/cs?searchtype=author&query=Chung%2C+J+S">Joon Son Chung</a> </p> <p 
class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2303.11771v1-abstract-short" style="display: inline;"> The goal of this work is to develop self-sufficient framework for Continuous Sign Language Recognition (CSLR) that addresses key issues of sign language recognition. These include the need for complex multi-scale features such as hands, face, and mouth for understanding, and absence of frame-level annotations. To this end, we propose (1) Divide and Focus Convolution (DFConv) which extracts both ma… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.11771v1-abstract-full').style.display = 'inline'; document.getElementById('2303.11771v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2303.11771v1-abstract-full" style="display: none;"> The goal of this work is to develop self-sufficient framework for Continuous Sign Language Recognition (CSLR) that addresses key issues of sign language recognition. These include the need for complex multi-scale features such as hands, face, and mouth for understanding, and absence of frame-level annotations. To this end, we propose (1) Divide and Focus Convolution (DFConv) which extracts both manual and non-manual features without the need for additional networks or annotations, and (2) Dense Pseudo-Label Refinement (DPLR) which propagates non-spiky frame-level pseudo-labels by combining the ground truth gloss sequence labels with the predicted sequence. We demonstrate that our model achieves state-of-the-art performance among RGB-based methods on large-scale CSLR benchmarks, PHOENIX-2014 and PHOENIX-2014-T, while showing comparable results with better efficiency when compared to other approaches that use multi-modality or extra annotations. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.11771v1-abstract-full').style.display = 'none'; document.getElementById('2303.11771v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 March, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="">arXiv:2303.11717</a> <span> [<a href="">pdf</a>, <a href="">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> A Complete Survey on Generative AI (AIGC): Is ChatGPT from GPT-4 to GPT-5 All You Need? 
</p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chaoning Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chenshuang Zhang</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+S">Sheng Zheng</a>, <a href="/search/cs?searchtype=author&query=Qiao%2C+Y">Yu Qiao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Chenghao Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Mengchun Zhang</a>, <a href="/search/cs?searchtype=author&query=Dam%2C+S+K">Sumit Kumar Dam</a>, <a href="/search/cs?searchtype=author&query=Thwal%2C+C+M">Chu Myaet Thwal</a>, <a href="/search/cs?searchtype=author&query=Tun%2C+Y+L">Ye Lin Tun</a>, <a href="/search/cs?searchtype=author&query=Huy%2C+L+L">Le Luang Huy</a>, <a href="/search/cs?searchtype=author&query=kim%2C+D">Donguk kim</a>, <a href="/search/cs?searchtype=author&query=Bae%2C+S">Sung-Ho Bae</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+L">Lik-Hang Lee</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yang Yang</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+H+T">Heng Tao Shen</a>, <a href="/search/cs?searchtype=author&query=Kweon%2C+I+S">In So Kweon</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+C+S">Choong Seon Hong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2303.11717v1-abstract-short" style="display: inline;"> As ChatGPT goes viral, generative AI (AIGC, a.k.a AI-generated content) has made headlines everywhere because of its ability to analyze and create text, images, and beyond. With such overwhelming media coverage, it is almost impossible for us to miss the opportunity to glimpse AIGC from a certain angle. 
In the era of AI transitioning from pure analysis to creation, it is worth noting that ChatGPT,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.11717v1-abstract-full').style.display = 'inline'; document.getElementById('2303.11717v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2303.11717v1-abstract-full" style="display: none;"> As ChatGPT goes viral, generative AI (AIGC, a.k.a AI-generated content) has made headlines everywhere because of its ability to analyze and create text, images, and beyond. With such overwhelming media coverage, it is almost impossible for us to miss the opportunity to glimpse AIGC from a certain angle. In the era of AI transitioning from pure analysis to creation, it is worth noting that ChatGPT, with its most recent language model GPT-4, is just a tool out of numerous AIGC tasks. Impressed by the capability of the ChatGPT, many people are wondering about its limits: can GPT-5 (or other future GPT variants) help ChatGPT unify all AIGC tasks for diversified content creation? Toward answering this question, a comprehensive review of existing AIGC tasks is needed. As such, our work comes to fill this gap promptly by offering a first look at AIGC, ranging from its techniques to applications. Modern generative AI relies on various technical foundations, ranging from model architecture and self-supervised pretraining to generative modeling methods (like GAN and diffusion models). After introducing the fundamental techniques, this work focuses on the technological development of various AIGC tasks based on their output type, including text, images, videos, 3D content, etc., which depicts the full potential of ChatGPT's future. Moreover, we summarize their significant applications in some mainstream industries, such as education and creativity content. 
Finally, we discuss the challenges currently faced and present an outlook on how generative AI might evolve in the near future. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.11717v1-abstract-full').style.display = 'none'; document.getElementById('2303.11717v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 March, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">56 pages, 548 citations</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="">arXiv:2303.07909</a> <span> [<a href="">pdf</a>, <a href="">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Text-to-image Diffusion Models in Generative AI: A Survey </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chenshuang Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chaoning Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Mengchun Zhang</a>, <a href="/search/cs?searchtype=author&query=Kweon%2C+I+S">In So Kweon</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+J">Junmo Kim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span 
class="abstract-short has-text-grey-dark mathjax" id="2303.07909v3-abstract-short" style="display: inline;"> This survey reviews the progress of diffusion models in generating images from text, i.e., text-to-image diffusion models. As a self-contained work, this survey starts with a brief introduction of how diffusion models work for image synthesis, followed by the background for text-conditioned image synthesis. Based on that, we present an organized review of pioneering methods and their impr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.07909v3-abstract-full').style.display = 'inline'; document.getElementById('2303.07909v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2303.07909v3-abstract-full" style="display: none;"> This survey reviews the progress of diffusion models in generating images from text, i.e., text-to-image diffusion models. As a self-contained work, this survey starts with a brief introduction of how diffusion models work for image synthesis, followed by the background for text-conditioned image synthesis. Based on that, we present an organized review of pioneering methods and their improvements on text-to-image generation. We further summarize applications beyond image generation, such as text-guided generation for various modalities like videos, and text-guided image editing. Beyond the progress made so far, we discuss existing challenges and promising future directions. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.07909v3-abstract-full').style.display = 'none'; document.getElementById('2303.07909v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 March, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">First survey on the recent progress of text-to-image generation based on the diffusion model</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="">arXiv:2303.01904</a> <span> [<a href="">pdf</a>, <a href="">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> EcoTTA: Memory-Efficient Continual Test-time Adaptation via Self-distilled Regularization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Song%2C+J">Junha Song</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+J">Jungsoo Lee</a>, <a href="/search/cs?searchtype=author&query=Kweon%2C+I+S">In So Kweon</a>, <a href="/search/cs?searchtype=author&query=Choi%2C+S">Sungha Choi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2303.01904v4-abstract-short" style="display: inline;"> This paper presents a simple yet effective approach that improves continual test-time adaptation (TTA) in a memory-efficient 
manner. TTA may primarily be conducted on edge devices with limited memory, so reducing memory is crucial but has been overlooked in previous TTA studies. In addition, long-term adaptation often leads to catastrophic forgetting and error accumulation, which hinders applying… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.01904v4-abstract-full').style.display = 'inline'; document.getElementById('2303.01904v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2303.01904v4-abstract-full" style="display: none;"> This paper presents a simple yet effective approach that improves continual test-time adaptation (TTA) in a memory-efficient manner. TTA may primarily be conducted on edge devices with limited memory, so reducing memory is crucial but has been overlooked in previous TTA studies. In addition, long-term adaptation often leads to catastrophic forgetting and error accumulation, which hinders applying TTA in real-world deployments. Our approach consists of two components to address these issues. First, we present lightweight meta networks that can adapt the frozen original networks to the target domain. This novel architecture minimizes memory consumption by decreasing the size of intermediate activations required for backpropagation. Second, our novel self-distilled regularization controls the output of the meta networks not to deviate significantly from the output of the frozen original networks, thereby preserving well-trained knowledge from the source domain. Without additional memory, this regularization prevents error accumulation and catastrophic forgetting, resulting in stable performance even in long-term test-time adaptation. We demonstrate that our simple yet effective strategy outperforms other state-of-the-art methods on various benchmarks for image classification and semantic segmentation tasks. 
Notably, our proposed method with ResNet-50 and WideResNet-40 takes 86% and 80% less memory than the recent state-of-the-art method, CoTTA. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.01904v4-abstract-full').style.display = 'none'; document.getElementById('2303.01904v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 March, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to CVPR 2023, Project page:</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="">arXiv:2302.12831</a> <span> [<a href="">pdf</a>, <a href="">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> CDPMSR: Conditional Diffusion Probabilistic Models for Single Image Super-Resolution </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Niu%2C+A">Axi Niu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kang Zhang</a>, <a href="/search/cs?searchtype=author&query=Pham%2C+T+X">Trung X. 
Pham</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+J">Jinqiu Sun</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yu Zhu</a>, <a href="/search/cs?searchtype=author&query=Kweon%2C+I+S">In So Kweon</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yanning Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2302.12831v1-abstract-short" style="display: inline;"> Diffusion probabilistic models (DPM) have been widely adopted in image-to-image translation to generate high-quality images. Prior attempts at applying the DPM to image super-resolution (SR) have shown that iteratively refining a pure Gaussian noise with a conditional image using a U-Net trained on denoising at various-level noises can help obtain a satisfied high-resolution image for the low-reso… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2302.12831v1-abstract-full').style.display = 'inline'; document.getElementById('2302.12831v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2302.12831v1-abstract-full" style="display: none;"> Diffusion probabilistic models (DPM) have been widely adopted in image-to-image translation to generate high-quality images. Prior attempts at applying the DPM to image super-resolution (SR) have shown that iteratively refining a pure Gaussian noise with a conditional image using a U-Net trained on denoising at various-level noises can help obtain a satisfied high-resolution image for the low-resolution one. To further improve the performance and simplify current DPM-based super-resolution methods, we propose a simple but non-trivial DPM-based super-resolution post-process framework, i.e., cDPMSR. 
After applying a pre-trained SR model on the to-be-test LR image to provide the conditional input, we adapt the standard DPM to conduct conditional image generation and perform super-resolution through a deterministic iterative denoising process. Our method surpasses prior attempts on both qualitative and quantitative results and can generate more photo-realistic counterparts for the low-resolution images with various benchmark datasets including Set5, Set14, Urban100, BSD100, and Manga109. Code will be published after accepted. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2302.12831v1-abstract-full').style.display = 'none'; document.getElementById('2302.12831v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 February, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">4 pages, 4 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="">arXiv:2301.11174</a> <span> [<a href="">pdf</a>, <a href="">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Semi-Supervised Image Captioning by Adversarially Propagating Labeled Data </p> <p class="authors"> 
<span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kim%2C+D">Dong-Jin Kim</a>, <a href="/search/cs?searchtype=author&query=Oh%2C+T">Tae-Hyun Oh</a>, <a href="/search/cs?searchtype=author&query=Choi%2C+J">Jinsoo Choi</a>, <a href="/search/cs?searchtype=author&query=Kweon%2C+I+S">In So Kweon</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2301.11174v1-abstract-short" style="display: inline;"> We present a novel data-efficient semi-supervised framework to improve the generalization of image captioning models. Constructing a large-scale labeled image captioning dataset is an expensive task in terms of labor, time, and cost. In contrast to manually annotating all the training samples, separately collecting uni-modal datasets is immensely easier, e.g., a large-scale image dataset and a sen… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2301.11174v1-abstract-full').style.display = 'inline'; document.getElementById('2301.11174v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2301.11174v1-abstract-full" style="display: none;"> We present a novel data-efficient semi-supervised framework to improve the generalization of image captioning models. Constructing a large-scale labeled image captioning dataset is an expensive task in terms of labor, time, and cost. In contrast to manually annotating all the training samples, separately collecting uni-modal datasets is immensely easier, e.g., a large-scale image dataset and a sentence dataset. We leverage such massive unpaired image and caption data upon standard paired data by learning to associate them. 
To this end, our proposed semi-supervised learning method assigns pseudo-labels to unpaired samples in an adversarial learning fashion, where the joint distribution of image and caption is learned. Our method trains a captioner to learn from a paired data and to progressively associate unpaired data. This approach shows noticeable performance improvement even in challenging scenarios including out-of-task data (i.e., relational captioning, where the target task is different from the unpaired data) and web-crawled data. We also show that our proposed method is theoretically well-motivated and has a favorable global optimal property. Our extensive and comprehensive empirical results both on (1) image-based and (2) dense region-based captioning datasets followed by comprehensive analysis on the scarcely-paired COCO dataset demonstrate the consistent effectiveness of our semisupervised learning method with unpaired data compared to competing methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2301.11174v1-abstract-full').style.display = 'none'; document.getElementById('2301.11174v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 January, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2023. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Journal extension of our EMNLP 2019 paper (arXiv:1909.02201)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="">arXiv:2301.00808</a> <span> [<a href="">pdf</a>, <a href="">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Woo%2C+S">Sanghyun Woo</a>, <a href="/search/cs?searchtype=author&query=Debnath%2C+S">Shoubhik Debnath</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+R">Ronghang Hu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xinlei Chen</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhuang Liu</a>, <a href="/search/cs?searchtype=author&query=Kweon%2C+I+S">In So Kweon</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+S">Saining Xie</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2301.00808v1-abstract-short" style="display: inline;"> Driven by improved architectures and better representation learning frameworks, the field of visual recognition has enjoyed rapid modernization and performance boost in the early 2020s. For example, modern ConvNets, represented by ConvNeXt, have demonstrated strong performance in various scenarios. 
While these models were originally designed for supervised learning with ImageNet labels, they can a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2301.00808v1-abstract-full').style.display = 'inline'; document.getElementById('2301.00808v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2301.00808v1-abstract-full" style="display: none;"> Driven by improved architectures and better representation learning frameworks, the field of visual recognition has enjoyed rapid modernization and performance boost in the early 2020s. For example, modern ConvNets, represented by ConvNeXt, have demonstrated strong performance in various scenarios. While these models were originally designed for supervised learning with ImageNet labels, they can also potentially benefit from self-supervised learning techniques such as masked autoencoders (MAE). However, we found that simply combining these two approaches leads to subpar performance. In this paper, we propose a fully convolutional masked autoencoder framework and a new Global Response Normalization (GRN) layer that can be added to the ConvNeXt architecture to enhance inter-channel feature competition. This co-design of self-supervised learning techniques and architectural improvement results in a new model family called ConvNeXt V2, which significantly improves the performance of pure ConvNets on various recognition benchmarks, including ImageNet classification, COCO detection, and ADE20K segmentation. We also provide pre-trained ConvNeXt V2 models of various sizes, ranging from an efficient 3.7M-parameter Atto model with 76.7% top-1 accuracy on ImageNet, to a 650M Huge model that achieves a state-of-the-art 88.9% accuracy using only public training data. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2301.00808v1-abstract-full').style.display = 'none'; document.getElementById('2301.00808v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 January, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Code and models available at</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="">arXiv:2212.10149</a> <span> [<a href="">pdf</a>, <a href="">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Tracking by Associating Clips </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Woo%2C+S">Sanghyun Woo</a>, <a href="/search/cs?searchtype=author&query=Park%2C+K">Kwanyong Park</a>, <a href="/search/cs?searchtype=author&query=Oh%2C+S+W">Seoung Wug Oh</a>, <a href="/search/cs?searchtype=author&query=Kweon%2C+I+S">In So Kweon</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+J">Joon-Young Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2212.10149v1-abstract-short" style="display: inline;"> The tracking-by-detection paradigm today has become the dominant method for multi-object tracking and works by detecting objects in each frame and then performing data association across frames. 
However, its sequential frame-wise matching property fundamentally suffers from the intermediate interruptions in a video, such as object occlusions, fast camera movements, and abrupt light changes. Moreov… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.10149v1-abstract-full').style.display = 'inline'; document.getElementById('2212.10149v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2212.10149v1-abstract-full" style="display: none;"> The tracking-by-detection paradigm today has become the dominant method for multi-object tracking and works by detecting objects in each frame and then performing data association across frames. However, its sequential frame-wise matching property fundamentally suffers from the intermediate interruptions in a video, such as object occlusions, fast camera movements, and abrupt light changes. Moreover, it typically overlooks temporal information beyond the two frames for matching. In this paper, we investigate an alternative by treating object association as clip-wise matching. Our new perspective views a single long video sequence as multiple short clips, and then the tracking is performed both within and between the clips. The benefits of this new approach are twofold. First, our method is robust to tracking error accumulation or propagation, as the video chunking allows bypassing the interrupted frames, and the short clip tracking avoids the conventional error-prone long-term track memory management. Second, the multiple frame information is aggregated during the clip-wise matching, resulting in a more accurate long-range track association than the current frame-wise matching. Given the state-of-the-art tracking-by-detection tracker, QDTrack, we showcase how the tracking performance improves with our new tracking formulation. 
We evaluate our proposals on two tracking benchmarks, TAO and MOT17 that have complementary characteristics and challenges each other. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.10149v1-abstract-full').style.display = 'none'; document.getElementById('2212.10149v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 December, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ECCV 2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="">arXiv:2212.10147</a> <span> [<a href="">pdf</a>, <a href="">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Bridging Images and Videos: A Simple Learning Framework for Large Vocabulary Video Object Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Woo%2C+S">Sanghyun Woo</a>, <a href="/search/cs?searchtype=author&query=Park%2C+K">Kwanyong Park</a>, <a href="/search/cs?searchtype=author&query=Oh%2C+S+W">Seoung Wug Oh</a>, <a href="/search/cs?searchtype=author&query=Kweon%2C+I+S">In So Kweon</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+J">Joon-Young Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2212.10147v1-abstract-short" style="display: inline;"> Scaling object taxonomies is one of the important steps toward a robust 
real-world deployment of recognition systems. We have faced remarkable progress in images since the introduction of the LVIS benchmark. To continue this success in videos, a new video benchmark, TAO, was recently presented. Given the recent encouraging results from both detection and tracking communities, we are interested in… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.10147v1-abstract-full').style.display = 'inline'; document.getElementById('2212.10147v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2212.10147v1-abstract-full" style="display: none;"> Scaling object taxonomies is one of the important steps toward a robust real-world deployment of recognition systems. We have faced remarkable progress in images since the introduction of the LVIS benchmark. To continue this success in videos, a new video benchmark, TAO, was recently presented. Given the recent encouraging results from both detection and tracking communities, we are interested in marrying those two advances and building a strong large vocabulary video tracker. However, supervisions in LVIS and TAO are inherently sparse or even missing, posing two new challenges for training the large vocabulary trackers. First, no tracking supervisions are in LVIS, which leads to inconsistent learning of detection (with LVIS and TAO) and tracking (only with TAO). Second, the detection supervisions in TAO are partial, which results in catastrophic forgetting of absent LVIS categories during video fine-tuning. To resolve these challenges, we present a simple but effective learning framework that takes full advantage of all available training data to learn detection and tracking while not losing any LVIS categories to recognize. 
With this new learning scheme, we show that consistent improvements of various large vocabulary trackers are achievable, setting strong baseline results on the challenging TAO benchmarks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.10147v1-abstract-full').style.display = 'none'; document.getElementById('2212.10147v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 December, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ECCV 2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="">arXiv:2212.08356</a> <span> [<a href="">pdf</a>, <a href="">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Test-time Adaptation in the Dynamic World with Compound Domain Knowledge Management </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Song%2C+J">Junha Song</a>, <a href="/search/cs?searchtype=author&query=Park%2C+K">Kwanyong Park</a>, <a href="/search/cs?searchtype=author&query=Shin%2C+I">InKyu Shin</a>, <a href="/search/cs?searchtype=author&query=Woo%2C+S">Sanghyun Woo</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chaoning Zhang</a>, <a href="/search/cs?searchtype=author&query=Kweon%2C+I+S">In So Kweon</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" 
id="2212.08356v3-abstract-short" style="display: inline;"> Prior to the deployment of robotic systems, pre-training the deep-recognition models on all potential visual cases is infeasible in practice. Hence, test-time adaptation (TTA) allows the model to adapt itself to novel environments and improve its performance during test time (i.e., lifelong adaptation). Several works for TTA have shown promising adaptation performances in continuously changing env… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.08356v3-abstract-full').style.display = 'inline'; document.getElementById('2212.08356v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2212.08356v3-abstract-full" style="display: none;"> Prior to the deployment of robotic systems, pre-training the deep-recognition models on all potential visual cases is infeasible in practice. Hence, test-time adaptation (TTA) allows the model to adapt itself to novel environments and improve its performance during test time (i.e., lifelong adaptation). Several works for TTA have shown promising adaptation performances in continuously changing environments. However, our investigation reveals that existing methods are vulnerable to dynamic distributional changes and often lead to overfitting of TTA models. To address this problem, this paper first presents a robust TTA framework with compound domain knowledge management. Our framework helps the TTA model to harvest the knowledge of multiple representative domains (i.e., compound domain) and conduct the TTA based on the compound domain knowledge. In addition, to prevent overfitting of the TTA model, we devise novel regularization which modulates the adaptation rates using domain-similarity between the source and the current target domain. 
With the synergy of the proposed framework and regularization, we achieve consistent performance improvements in diverse TTA scenarios, especially on dynamic domain shifts. We demonstrate the generality of proposals via extensive experiments including image classification on ImageNet-C and semantic segmentation on GTA5, C-driving, and corrupted Cityscapes datasets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.08356v3-abstract-full').style.display = 'none'; document.getElementById('2212.08356v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 April, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 December, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="">arXiv:2212.08355</a> <span> [<a href="">pdf</a>, <a href="">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Learning Classifiers of Prototypes and Reciprocal Points for Universal Domain Adaptation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hur%2C+S">Sungsu Hur</a>, <a href="/search/cs?searchtype=author&query=Shin%2C+I">Inkyu Shin</a>, <a href="/search/cs?searchtype=author&query=Park%2C+K">Kwanyong Park</a>, <a href="/search/cs?searchtype=author&query=Woo%2C+S">Sanghyun Woo</a>, <a href="/search/cs?searchtype=author&query=Kweon%2C+I+S">In So 
Kweon</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2212.08355v1-abstract-short" style="display: inline;"> Universal Domain Adaptation aims to transfer the knowledge between the datasets by handling two shifts: domain-shift and category-shift. The main challenge is correctly distinguishing the unknown target samples while adapting the distribution of known class knowledge from source to target. Most existing methods approach this problem by first training the target adapted known classifier and then re… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.08355v1-abstract-full').style.display = 'inline'; document.getElementById('2212.08355v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2212.08355v1-abstract-full" style="display: none;"> Universal Domain Adaptation aims to transfer the knowledge between the datasets by handling two shifts: domain-shift and category-shift. The main challenge is correctly distinguishing the unknown target samples while adapting the distribution of known class knowledge from source to target. Most existing methods approach this problem by first training the target adapted known classifier and then relying on the single threshold to distinguish unknown target samples. However, this simple threshold-based approach prevents the model from considering the underlying complexities existing between the known and unknown samples in the high-dimensional feature space. In this paper, we propose a new approach in which we use two sets of feature points, namely dual Classifiers for Prototypes and Reciprocals (CPR). Our key idea is to associate each prototype with corresponding known class features while pushing the reciprocals apart from these prototypes to locate them in the potential unknown feature space. 
The target samples are then classified as unknown if they fall near any reciprocals at test time. To successfully train our framework, we collect the partial, confident target samples that are classified as known or unknown through our proposed multi-criteria selection. We then additionally apply the entropy loss regularization to them. For further adaptation, we also apply standard consistency regularization that matches the predictions of two different views of the input to make the target feature space more compact. We evaluate our proposal, CPR, on three standard benchmarks and achieve comparable or new state-of-the-art results. We also provide extensive ablation experiments to verify our main design choices in our framework. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.08355v1-abstract-full').style.display = 'none'; document.getElementById('2212.08355v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 December, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at WACV 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="">arXiv:2211.11432</a> <span> [<a href="">pdf</a>, <a href="">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MATE: Masked Autoencoders are Online 3D Test-Time Learners </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mirza%2C+M+J">M. 
Jehanzeb Mirza</a>, <a href="/search/cs?searchtype=author&query=Shin%2C+I">Inkyu Shin</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+W">Wei Lin</a>, <a href="/search/cs?searchtype=author&query=Schriebl%2C+A">Andreas Schriebl</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+K">Kunyang Sun</a>, <a href="/search/cs?searchtype=author&query=Choe%2C+J">Jaesung Choe</a>, <a href="/search/cs?searchtype=author&query=Possegger%2C+H">Horst Possegger</a>, <a href="/search/cs?searchtype=author&query=Kozinski%2C+M">Mateusz Kozinski</a>, <a href="/search/cs?searchtype=author&query=Kweon%2C+I+S">In So Kweon</a>, <a href="/search/cs?searchtype=author&query=Yoon%2C+K">Kun-Jin Yoon</a>, <a href="/search/cs?searchtype=author&query=Bischof%2C+H">Horst Bischof</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2211.11432v3-abstract-short" style="display: inline;"> Our MATE is the first Test-Time-Training (TTT) method designed for 3D data, which makes deep networks trained for point cloud classification robust to distribution shifts occurring in test data. Like existing TTT methods from the 2D image domain, MATE also leverages test data for adaptation. Its test-time objective is that of a Masked Autoencoder: a large portion of each test point cloud is remove… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.11432v3-abstract-full').style.display = 'inline'; document.getElementById('2211.11432v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2211.11432v3-abstract-full" style="display: none;"> Our MATE is the first Test-Time-Training (TTT) method designed for 3D data, which makes deep networks trained for point cloud classification robust to distribution shifts occurring in test data. 
Like existing TTT methods from the 2D image domain, MATE also leverages test data for adaptation. Its test-time objective is that of a Masked Autoencoder: a large portion of each test point cloud is removed before it is fed to the network, tasked with reconstructing the full point cloud. Once the network is updated, it is used to classify the point cloud. We test MATE on several 3D object classification datasets and show that it significantly improves robustness of deep networks to several types of corruptions commonly occurring in 3D point clouds. We show that MATE is very efficient in terms of the fraction of points it needs for the adaptation. It can effectively adapt given as few as 5% of tokens of each test sample, making it extremely lightweight. Our experiments show that MATE also achieves competitive performance by adapting sparsely on the test data, which further reduces its computational overhead, making it ideal for real-time applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.11432v3-abstract-full').style.display = 'none'; document.getElementById('2211.11432v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 March, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 November, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2022. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Code is available at this repository:</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="">arXiv:2211.00448</a> <span> [<a href="">pdf</a>, <a href="">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Signing Outside the Studio: Benchmarking Background Robustness for Continuous Sign Language Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jang%2C+Y">Youngjoon Jang</a>, <a href="/search/cs?searchtype=author&query=Oh%2C+Y">Youngtaek Oh</a>, <a href="/search/cs?searchtype=author&query=Cho%2C+J+W">Jae Won Cho</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+D">Dong-Jin Kim</a>, <a href="/search/cs?searchtype=author&query=Chung%2C+J+S">Joon Son Chung</a>, <a href="/search/cs?searchtype=author&query=Kweon%2C+I+S">In So Kweon</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2211.00448v1-abstract-short" style="display: inline;"> The goal of this work is background-robust continuous sign language recognition. Most existing Continuous Sign Language Recognition (CSLR) benchmarks have fixed backgrounds and are filmed in studios with a static monochromatic background. However, signing is not limited only to studios in the real world. 
In order to analyze the robustness of CSLR models under background shifts, we first evaluate e… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.00448v1-abstract-full').style.display = 'inline'; document.getElementById('2211.00448v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2211.00448v1-abstract-full" style="display: none;"> The goal of this work is background-robust continuous sign language recognition. Most existing Continuous Sign Language Recognition (CSLR) benchmarks have fixed backgrounds and are filmed in studios with a static monochromatic background. However, signing is not limited only to studios in the real world. In order to analyze the robustness of CSLR models under background shifts, we first evaluate existing state-of-the-art CSLR models on diverse backgrounds. To synthesize the sign videos with a variety of backgrounds, we propose a pipeline to automatically generate a benchmark dataset utilizing existing CSLR benchmarks. Our newly constructed benchmark dataset consists of diverse scenes to simulate a real-world environment. We observe even the most recent CSLR method cannot recognize glosses well on our new dataset with changed backgrounds. In this regard, we also propose a simple yet effective training scheme including (1) background randomization and (2) feature disentanglement for CSLR models. The experimental results on our dataset demonstrate that our method generalizes well to other unseen background data with minimal additional training images. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.00448v1-abstract-full').style.display = 'none'; document.getElementById('2211.00448v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 November, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Our dataset is available at</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="">arXiv:2210.12126</a> <span> [<a href="">pdf</a>, <a href="">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> One-Shot Neural Fields for 3D Object Understanding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Blukis%2C+V">Valts Blukis</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+T">Taeyeop Lee</a>, <a href="/search/cs?searchtype=author&query=Tremblay%2C+J">Jonathan Tremblay</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+B">Bowen Wen</a>, <a href="/search/cs?searchtype=author&query=Kweon%2C+I+S">In So Kweon</a>, <a href="/search/cs?searchtype=author&query=Yoon%2C+K">Kuk-Jin Yoon</a>, <a href="/search/cs?searchtype=author&query=Fox%2C+D">Dieter Fox</a>, <a 
href="/search/cs?searchtype=author&query=Birchfield%2C+S">Stan Birchfield</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2210.12126v3-abstract-short" style="display: inline;"> We present a unified and compact scene representation for robotics, where each object in the scene is depicted by a latent code capturing geometry and appearance. This representation can be decoded for various tasks such as novel view rendering, 3D reconstruction (e.g. recovering depth, point clouds, or voxel maps), collision checking, and stable grasp prediction. We build our representation from… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.12126v3-abstract-full').style.display = 'inline'; document.getElementById('2210.12126v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2210.12126v3-abstract-full" style="display: none;"> We present a unified and compact scene representation for robotics, where each object in the scene is depicted by a latent code capturing geometry and appearance. This representation can be decoded for various tasks such as novel view rendering, 3D reconstruction (e.g. recovering depth, point clouds, or voxel maps), collision checking, and stable grasp prediction. We build our representation from a single RGB input image at test time by leveraging recent advances in Neural Radiance Fields (NeRF) that learn category-level priors on large multiview datasets, then fine-tune on novel objects from one or few views. We expand the NeRF model for additional grasp outputs and explore ways to leverage this representation for robotics. At test-time, we build the representation from a single RGB input image observing the scene from only one viewpoint. 
We find that the recovered representation allows rendering from novel views, including occluded object parts, and also for predicting successful stable grasps. Grasp poses can be directly decoded from our latent representation with an implicit grasp decoder. We experimented in both simulation and the real world and demonstrated the capability for robust robotic grasping using such a compact representation. Website: <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.12126v3-abstract-full').style.display = 'none'; document.getElementById('2210.12126v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshop (CVPRW) on XRNeRF: Advances in NeRF for the Metaverse 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="">arXiv:2209.05771</a> <span> [<a href="">pdf</a>, <a href="">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Moving from 2D to 3D: volumetric medical image classification for rectal cancer staging </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lee%2C+J">Joohyung 
Lee</a>, <a href="/search/cs?searchtype=author&query=Oh%2C+J">Jieun Oh</a>, <a href="/search/cs?searchtype=author&query=Shin%2C+I">Inkyu Shin</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+Y">You-sung Kim</a>, <a href="/search/cs?searchtype=author&query=Sohn%2C+D+K">Dae Kyung Sohn</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+T">Tae-sung Kim</a>, <a href="/search/cs?searchtype=author&query=Kweon%2C+I+S">In So Kweon</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2209.05771v1-abstract-short" style="display: inline;"> Volumetric images from Magnetic Resonance Imaging (MRI) provide invaluable information in preoperative staging of rectal cancer. Above all, accurate preoperative discrimination between T2 and T3 stages is arguably both the most challenging and clinically significant task for rectal cancer treatment, as chemo-radiotherapy is usually recommended to patients with T3 (or greater) stage cancer. In this… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2209.05771v1-abstract-full').style.display = 'inline'; document.getElementById('2209.05771v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2209.05771v1-abstract-full" style="display: none;"> Volumetric images from Magnetic Resonance Imaging (MRI) provide invaluable information in preoperative staging of rectal cancer. Above all, accurate preoperative discrimination between T2 and T3 stages is arguably both the most challenging and clinically significant task for rectal cancer treatment, as chemo-radiotherapy is usually recommended to patients with T3 (or greater) stage cancer. In this study, we present a volumetric convolutional neural network to accurately discriminate T2 from T3 stage rectal cancer with rectal MR volumes. 
Specifically, we propose 1) a custom ResNet-based volume encoder that models the inter-slice relationship with late fusion (i.e., 3D convolution at the last layer), 2) a bilinear computation that aggregates the resulting features from the encoder to create a volume-wise feature, and 3) a joint minimization of triplet loss and focal loss. With MR volumes of pathologically confirmed T2/T3 rectal cancer, we perform extensive experiments to compare various designs within the framework of residual learning. As a result, our network achieves an AUC of 0.831, which is higher than the reported accuracy of the professional radiologist groups. We believe this method can be extended to other volume analysis tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2209.05771v1-abstract-full').style.display = 'none'; document.getElementById('2209.05771v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 September, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2022. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, 2 figures, accepted to MICCAI 2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="">arXiv:2208.01924</a> <span> [<a href="">pdf</a>, <a href="">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Per-Clip Video Object Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Park%2C+K">Kwanyong Park</a>, <a href="/search/cs?searchtype=author&query=Woo%2C+S">Sanghyun Woo</a>, <a href="/search/cs?searchtype=author&query=Oh%2C+S+W">Seoung Wug Oh</a>, <a href="/search/cs?searchtype=author&query=Kweon%2C+I+S">In So Kweon</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+J">Joon-Young Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2208.01924v1-abstract-short" style="display: inline;"> Recently, memory-based approaches show promising results on semi-supervised video object segmentation. These methods predict object masks frame-by-frame with the help of frequently updated memory of the previous mask. Different from this per-frame inference, we investigate an alternative perspective by treating video object segmentation as clip-wise mask propagation. 
In this per-clip inference sch… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2208.01924v1-abstract-full').style.display = 'inline'; document.getElementById('2208.01924v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2208.01924v1-abstract-full" style="display: none;"> Recently, memory-based approaches show promising results on semi-supervised video object segmentation. These methods predict object masks frame-by-frame with the help of frequently updated memory of the previous mask. Different from this per-frame inference, we investigate an alternative perspective by treating video object segmentation as clip-wise mask propagation. In this per-clip inference scheme, we update the memory with an interval and simultaneously process a set of consecutive frames (i.e. clip) between the memory updates. The scheme provides two potential benefits: accuracy gain by clip-level optimization and efficiency gain by parallel computation of multiple frames. To this end, we propose a new method tailored for the per-clip inference. Specifically, we first introduce a clip-wise operation to refine the features based on intra-clip correlation. In addition, we employ a progressive matching mechanism for efficient information-passing within a clip. With the synergy of two modules and a newly proposed per-clip based training, our network achieves state-of-the-art performance on Youtube-VOS 2018/2019 val (84.6% and 84.6%) and DAVIS 2016/2017 val (91.9% and 86.1%). Furthermore, our model shows a great speed-accuracy trade-off with varying memory update intervals, which leads to huge flexibility. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2208.01924v1-abstract-full').style.display = 'none'; document.getElementById('2208.01924v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 August, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">CVPR 2022; Code is available at</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="">arXiv:2208.00690</a> <span> [<a href="">pdf</a>, <a href="">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Generative Bias for Robust Visual Question Answering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cho%2C+J+W">Jae Won Cho</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+D">Dong-jin Kim</a>, <a href="/search/cs?searchtype=author&query=Ryu%2C+H">Hyeonggon Ryu</a>, <a href="/search/cs?searchtype=author&query=Kweon%2C+I+S">In So Kweon</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2208.00690v3-abstract-short" style="display: 
inline;"> The task of Visual Question Answering (VQA) is known to be plagued by the issue of VQA models exploiting biases within the dataset to make its final prediction. Various previous ensemble based debiasing methods have been proposed where an additional model is purposefully trained to be biased in order to train a robust target model. However, these methods compute the bias for a model simply from th… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2208.00690v3-abstract-full').style.display = 'inline'; document.getElementById('2208.00690v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2208.00690v3-abstract-full" style="display: none;"> The task of Visual Question Answering (VQA) is known to be plagued by the issue of VQA models exploiting biases within the dataset to make its final prediction. Various previous ensemble based debiasing methods have been proposed where an additional model is purposefully trained to be biased in order to train a robust target model. However, these methods compute the bias for a model simply from the label statistics of the training data or from single modal branches. In this work, in order to better learn the bias a target VQA model suffers from, we propose a generative method to train the bias model directly from the target model, called GenB. In particular, GenB employs a generative network to learn the bias in the target model through a combination of the adversarial objective and knowledge distillation. We then debias our target model with GenB as a bias model, and show through extensive experiments the effects of our method on various VQA bias datasets including VQA-CP2, VQA-CP1, GQA-OOD, and VQA-CE, and show state-of-the-art results with the LXMERT architecture on VQA-CP2. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2208.00690v3-abstract-full').style.display = 'none'; document.getElementById('2208.00690v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 March, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 August, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">CVPR 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="">arXiv:2208.00173</a> <span> [<a href="">pdf</a>, <a href="">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> A Survey on Masked Autoencoder for Self-supervised Learning in Vision and Beyond </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chaoning Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chenshuang Zhang</a>, <a href="/search/cs?searchtype=author&query=Song%2C+J">Junha Song</a>, <a href="/search/cs?searchtype=author&query=Yi%2C+J+S+K">John Seon Keun Yi</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kang Zhang</a>, <a href="/search/cs?searchtype=author&query=Kweon%2C+I+S">In So Kweon</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis 
has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2208.00173v1-abstract-short" style="display: inline;"> Masked autoencoders are scalable vision learners, as the title of MAE \cite{he2022masked}, which suggests that self-supervised learning (SSL) in vision might undertake a similar trajectory as in NLP. Specifically, generative pretext tasks with the masked prediction (e.g., BERT) have become a de facto standard SSL practice in NLP. By contrast, early attempts at generative methods in vision have bee… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2208.00173v1-abstract-full').style.display = 'inline'; document.getElementById('2208.00173v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2208.00173v1-abstract-full" style="display: none;"> Masked autoencoders are scalable vision learners, as the title of MAE \cite{he2022masked}, which suggests that self-supervised learning (SSL) in vision might undertake a similar trajectory as in NLP. Specifically, generative pretext tasks with the masked prediction (e.g., BERT) have become a de facto standard SSL practice in NLP. By contrast, early attempts at generative methods in vision have been buried by their discriminative counterparts (like contrastive learning); however, the success of mask image modeling has revived the masking autoencoder (often termed denoising autoencoder in the past). As a milestone to bridge the gap with BERT in NLP, masked autoencoder has attracted unprecedented attention for SSL in vision and beyond. This work conducts a comprehensive survey of masked autoencoders to shed insight on a promising direction of SSL. As the first to review SSL with masked autoencoders, this work focuses on its application in vision by discussing its historical developments, recent progress, and implications for diverse applications. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2208.00173v1-abstract-full').style.display = 'none'; document.getElementById('2208.00173v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 July, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">First survey on masked autoencoder (under progress)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="">arXiv:2207.10899</a> <span> [<a href="">pdf</a>, <a href="">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Decoupled Adversarial Contrastive Learning for Self-supervised Adversarial Robustness </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chaoning Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kang Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chenshuang Zhang</a>, <a href="/search/cs?searchtype=author&query=Niu%2C+A">Axi Niu</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+J">Jiu Feng</a>, <a href="/search/cs?searchtype=author&query=Yoo%2C+C+D">Chang D. 
Yoo</a>, <a href="/search/cs?searchtype=author&query=Kweon%2C+I+S">In So Kweon</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2207.10899v1-abstract-short" style="display: inline;"> Adversarial training (AT) for robust representation learning and self-supervised learning (SSL) for unsupervised representation learning are two active research fields. Integrating AT into SSL, multiple prior works have accomplished a highly significant yet challenging task: learning robust representation without labels. A widely used framework is adversarial contrastive learning which couples AT… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2207.10899v1-abstract-full').style.display = 'inline'; document.getElementById('2207.10899v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2207.10899v1-abstract-full" style="display: none;"> Adversarial training (AT) for robust representation learning and self-supervised learning (SSL) for unsupervised representation learning are two active research fields. Integrating AT into SSL, multiple prior works have accomplished a highly significant yet challenging task: learning robust representation without labels. A widely used framework is adversarial contrastive learning which couples AT and SSL, and thus constitute a very complex optimization problem. Inspired by the divide-and-conquer philosophy, we conjecture that it might be simplified as well as improved by solving two sub-problems: non-robust SSL and pseudo-supervised AT. This motivation shifts the focus of the task from seeking an optimal integrating strategy for a coupled problem to finding sub-solutions for sub-problems. 
With this said, this work discards prior practices of directly introducing AT to SSL frameworks and proposed a two-stage framework termed Decoupled Adversarial Contrastive Learning (DeACL). Extensive experimental results demonstrate that our DeACL achieves SOTA self-supervised adversarial robustness while significantly reducing the training time, which validates its effectiveness and efficiency. Moreover, our DeACL constitutes a more explainable solution, and its success also bridges the gap with semi-supervised AT for exploiting unlabeled samples for robust representation learning. The code is publicly accessible at <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2207.10899v1-abstract-full').style.display = 'none'; document.getElementById('2207.10899v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 July, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2022. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ECCV 2022 oral presentation</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="">arXiv:2207.09812</a> <span> [<a href="">pdf</a>, <a href="">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> The Anatomy of Video Editing: A Dataset and Benchmark Suite for AI-Assisted Video Editing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Argaw%2C+D+M">Dawit Mureja Argaw</a>, <a href="/search/cs?searchtype=author&query=Heilbron%2C+F+C">Fabian Caba Heilbron</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+J">Joon-Young Lee</a>, <a href="/search/cs?searchtype=author&query=Woodson%2C+M">Markus Woodson</a>, <a href="/search/cs?searchtype=author&query=Kweon%2C+I+S">In So Kweon</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2207.09812v2-abstract-short" style="display: inline;"> Machine learning is transforming the video editing industry. Recent advances in computer vision have leveled-up video editing tasks such as intelligent reframing, rotoscoping, color grading, or applying digital makeups. However, most of the solutions have focused on video manipulation and VFX. 
This work introduces the Anatomy of Video Editing, a dataset, and benchmark, to foster research in AI-ass… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2207.09812v2-abstract-full').style.display = 'inline'; document.getElementById('2207.09812v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2207.09812v2-abstract-full" style="display: none;"> Machine learning is transforming the video editing industry. Recent advances in computer vision have leveled-up video editing tasks such as intelligent reframing, rotoscoping, color grading, or applying digital makeups. However, most of the solutions have focused on video manipulation and VFX. This work introduces the Anatomy of Video Editing, a dataset, and benchmark, to foster research in AI-assisted video editing. Our benchmark suite focuses on video editing tasks, beyond visual effects, such as automatic footage organization and assisted video assembling. To enable research on these fronts, we annotate more than 1.5M tags, with relevant concepts to cinematography, from 196176 shots sampled from movie scenes. We establish competitive baseline methods and detailed analyses for each of the tasks. We hope our work sparks innovative research towards underexplored areas of AI-assisted video editing. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2207.09812v2-abstract-full').style.display = 'none'; document.getElementById('2207.09812v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 July, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 July, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2022. 
</p>
<p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Code is available at:</span> </p>
</li>
</ol>
<!--
  Result pagination. The link for the page being displayed (class "is-current")
  is the only one that carries aria-current="page" and is labelled "Page N";
  links to the other pages are labelled "Goto page N".
-->
<nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination">
  <!-- No previous page on the first page of results: the link is kept but hidden with is-invisible. -->
  <a href="" class="pagination-previous is-invisible">Previous </a>
  <a href="/search/?searchtype=author&query=Kweon%2C+S&start=50" class="pagination-next">Next </a>
  <ul class="pagination-list">
    <li>
      <a href="/search/?searchtype=author&query=Kweon%2C+S&start=0" class="pagination-link is-current" aria-label="Page 1" aria-current="page">1 </a>
    </li>
    <li>
      <a href="/search/?searchtype=author&query=Kweon%2C+S&start=50" class="pagination-link" aria-label="Goto page 2">2 </a>
    </li>
    <li>
      <a href="/search/?searchtype=author&query=Kweon%2C+S&start=100" class="pagination-link" aria-label="Goto page 3">3 </a>
    </li>
    <li>
      <a href="/search/?searchtype=author&query=Kweon%2C+S&start=150" class="pagination-link" aria-label="Goto page 4">4 </a>
    </li>
  </ul>
</nav>
<div class="is-hidden-tablet">
  <!-- feedback for mobile only -->
  <span class="help" style="display: inline-block;"><a href="">Search v0.5.6 released 2020-02-24</a> </span>
</div>
</div>
</main>
<footer>
  <div class="columns is-desktop" role="navigation" aria-label="Secondary">
    <!-- MetaColumn 1 -->
    <div class="column">
      <div class="columns">
        <div class="column">
          <ul class="nav-spaced">
            <li><a href="">About</a></li>
            <li><a href="">Help</a></li>
          </ul>
        </div>
        <div class="column">
          <ul class="nav-spaced">
            <li>
              <svg xmlns="" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256
320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>
              <a href=""> Contact</a>
            </li>
            <li>
              <svg xmlns="" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg>
              <a href=""> Subscribe</a>
            </li>
          </ul>
        </div>
      </div>
    </div>
    <!-- end MetaColumn 1 -->
    <!-- MetaColumn 2 -->
    <div class="column">
      <div class="columns">
        <div class="column">
          <ul class="nav-spaced">
            <li><a href="">Copyright</a></li>
            <li><a href="">Privacy Policy</a></li>
          </ul>
        </div>
        <div class="column sorry-app-links">
          <ul class="nav-spaced">
            <li><a href="">Web Accessibility Assistance</a></li>
            <li>
              <!--
                The three links in this paragraph open in a new tab (target="_blank").
                rel="noopener" is set explicitly so the opened page gets no
                window.opener handle back to this one.
              -->
              <p class="help">
                <a class="a11y-main-link" href="" target="_blank" rel="noopener">arXiv Operational Status <svg xmlns="" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br>
                Get status notifications via
                <a class="is-link" href="" target="_blank" rel="noopener"><svg xmlns="" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a>
                or
                <a class="is-link" href="" target="_blank" rel="noopener"><svg xmlns="" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a>
              </p>
            </li>
          </ul>
        </div>
      </div>
    </div>
    <!-- end MetaColumn 2 -->
  </div>
</footer>
<script src=""></script>
</body>
</html>