Search | arXiv e-print repository

Showing 1–37 of 37 results for author: Vo, M
Search v0.5.6 released 2020-02-24

Searching in archive cs. Results are sorted by announcement date (newest first), 50 per page, with abstracts shown.
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.01154">arXiv:2412.01154</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.01154">pdf</a>, <a href="https://arxiv.org/format/2412.01154">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> R.I.P.: A Simple Black-box Attack on Continual Test-time Adaptation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hoang%2C+T">Trung-Hieu Hoang</a>, <a href="/search/cs?searchtype=author&amp;query=Vo%2C+D+M">Duc Minh Vo</a>, <a href="/search/cs?searchtype=author&amp;query=Do%2C+M+N">Minh N. Do</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.01154v1-abstract-short" style="display: inline;"> Test-time adaptation (TTA) has emerged as a promising solution to tackle the continual domain shift in machine learning by allowing model parameters to change at test time, via self-supervised learning on unlabeled testing data. At the same time, it unfortunately opens the door to unforeseen vulnerabilities for degradation over time. Through a simple theoretical continual TTA model, we successfull&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.01154v1-abstract-full').style.display = 'inline'; document.getElementById('2412.01154v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.01154v1-abstract-full" style="display: none;"> Test-time adaptation (TTA) has emerged as a promising solution to tackle the continual domain shift in machine learning by allowing model parameters to change at test time, via self-supervised learning on unlabeled testing data. At the same time, it unfortunately opens the door to unforeseen vulnerabilities for degradation over time. Through a simple theoretical continual TTA model, we successfully identify a risk in the sampling process of testing data that could easily degrade the performance of a continual TTA model. We name this risk as Reusing of Incorrect Prediction (RIP) that TTA attackers can employ or as a result of the unintended query from general TTA users. The risk posed by RIP is also highly realistic, as it does not require prior knowledge of model parameters or modification of testing samples. This simple requirement makes RIP as the first black-box TTA attack algorithm that stands out from existing white-box attempts. 
We extensively benchmark the performance of the most recent continual TTA approaches under the RIP attack, providing insights into why it succeeds and laying out potential roadmaps for enhancing the resilience of future continual TTA systems.
Submitted 2 December, 2024; originally announced December 2024.
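
The abstract describes the attack only at a high level. Below is a minimal sketch of what a reuse-of-incorrect-prediction query stream could look like, assuming the querying party knows the true labels of its own samples and can observe only the model's predictions; the names model_predict, test_pool, and the reuse probability are illustrative placeholders, not details from the paper.

import random

def build_rip_stream(model_predict, test_pool, n_queries, reuse_prob=0.5):
    """Sketch of a Reusing-of-Incorrect-Prediction (RIP) style query stream.

    model_predict: black-box callable mapping a sample to a predicted label.
    test_pool: list of (sample, true_label) pairs owned by the querying party.
    Samples the model gets wrong are kept and re-queried later; that reuse is
    the behaviour the abstract identifies as the risk.
    """
    incorrect = []   # samples the model has already mispredicted
    stream = []      # record of what was sent and what came back
    for _ in range(n_queries):
        if incorrect and random.random() < reuse_prob:
            sample, label = random.choice(incorrect)   # reuse a known failure
        else:
            sample, label = random.choice(test_pool)   # draw a fresh sample
        pred = model_predict(sample)   # black-box query; the TTA model adapts internally
        if pred != label:
            incorrect.append((sample, label))
        stream.append((sample, label, pred))
    return stream

The loop never reads model parameters and never modifies the samples, which matches the black-box constraint emphasized in the abstract.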

2. arXiv:2411.17794 [pdf, other] cs.CV (Computer Vision and Pattern Recognition)
Title: NEMO: Can Multimodal LLMs Identify Attribute-Modified Objects?
Authors: Jiaxuan Li, Junwen Mo, MinhDuc Vo, Akihiro Sugimoto, Hideki Nakayama
Abstract: Multimodal Large Language Models (MLLMs) have made notable advances in visual understanding, yet their ability to recognize objects modified by specific attributes remains an open question. To address this, we explore MLLMs' reasoning capabilities in object recognition, ranging from commonsense to beyond-commonsense scenarios. We introduce a novel benchmark, NEMO, which comprises 900 images of origiNal fruits and their corresponding attributE-MOdified ones, along with a set of 2,700 questions of open, multiple-choice, and unsolvable types. We assess 26 recent open-sourced and commercial models using our benchmark. The findings highlight pronounced performance gaps in recognizing objects in NEMO and reveal distinct answer preferences across different models. Although stronger vision encoders improve performance, MLLMs still lag behind standalone vision encoders. Interestingly, scaling up the model size does not consistently yield better outcomes, as deeper analysis reveals that larger LLMs can weaken vision encoders during fine-tuning. These insights shed light on critical limitations in current MLLMs and suggest potential pathways toward developing more versatile and resilient multimodal models.
Submitted 26 November, 2024; originally announced November 2024.

3. arXiv:2312.06797 [pdf, other] cs.CV (Computer Vision and Pattern Recognition)
Title: Improving the Robustness of 3D Human Pose Estimation: A Benchmark and Learning from Noisy Input
Authors: Trung-Hieu Hoang, Mona Zehni, Huy Phan, Duc Minh Vo, Minh N. Do
Abstract: Despite the promising performance of current 3D human pose estimation techniques, understanding and enhancing their generalization on challenging in-the-wild videos remains an open problem. In this work, we focus on the robustness of 2D-to-3D pose lifters. To this end, we develop two benchmark datasets, namely Human3.6M-C and HumanEva-I-C, to examine the robustness of video-based 3D pose lifters to a wide range of common video corruptions, including temporary occlusion, motion blur, and pixel-level noise.
We observe the poor generalization of state-of-the-art 3D pose lifters in the presence of corruption and establish two techniques to tackle this issue. First, we introduce Temporal Additive Gaussian Noise (TAGN) as a simple yet effective 2D input pose data augmentation. Additionally, to incorporate the confidence scores output by the 2D pose detectors, we design a confidence-aware convolution (CA-Conv) block. Extensively tested on corrupted videos, the proposed strategies consistently boost the robustness of 3D pose lifters and serve as new baselines for future research.
Submitted 15 April, 2024; v1 submitted 11 December, 2023; originally announced December 2023.
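
TAGN is only named in the abstract. The sketch below shows one plausible reading, adding zero-mean Gaussian noise to detected 2D keypoint sequences at training time; the array shape and noise scale are assumptions rather than the paper's settings.

import numpy as np

def temporal_additive_gaussian_noise(pose_seq, sigma=0.02, rng=None):
    """Add zero-mean Gaussian noise to a 2D pose sequence (training-time augmentation).

    pose_seq: array of shape (T, J, 2): T frames, J joints, (x, y) per joint.
    Returns a noisy copy; the clean sequence is left untouched.
    """
    rng = np.random.default_rng() if rng is None else rng
    noise = rng.normal(loc=0.0, scale=sigma, size=pose_seq.shape)
    return pose_seq + noise

# Usage sketch: perturb detected 2D keypoints before feeding the 2D-to-3D lifter.
clip = np.zeros((243, 17, 2))      # e.g. a 243-frame clip with 17 joints
noisy_clip = temporal_additive_gaussian_noise(clip)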

4. arXiv:2311.18193 [pdf, other] cs.CV (Computer Vision and Pattern Recognition)
Title: Persistent Test-time Adaptation in Recurring Testing Scenarios
Authors: Trung-Hieu Hoang, Duc Minh Vo, Minh N. Do
Abstract: Current test-time adaptation (TTA) approaches aim to adapt a machine learning model to environments that change continuously. Yet, it is unclear whether TTA methods can maintain their adaptability over prolonged periods. To answer this question, we introduce a diagnostic setting, recurring TTA, where environments not only change but also recur over time, creating an extensive data stream. This setting allows us to examine the error accumulation of TTA models, in the most basic scenario, when they are regularly exposed to previous testing environments. Furthermore, we simulate a TTA process on a simple yet representative $\epsilon$-perturbed Gaussian Mixture Model Classifier, deriving theoretical insights into the dataset- and algorithm-dependent factors contributing to gradual performance degradation. Our investigation leads us to propose persistent TTA (PeTTA), which senses when the model is diverging towards collapse and adjusts the adaptation strategy, striking a balance between the dual objectives of adaptation and model collapse prevention. The superior stability of PeTTA over existing approaches in lifelong TTA scenarios is demonstrated through comprehensive experiments on various benchmarks. Our project page is available at https://hthieu166.github.io/petta.
Submitted 2 November, 2024; v1 submitted 29 November, 2023; originally announced November 2023.
Comments: Accepted to the 38th Conference on Neural Information Processing Systems (NeurIPS 2024)

5. arXiv:2311.15879 [pdf, other] cs.CV (Computer Vision and Pattern Recognition)
Title: EVCap: Retrieval-Augmented Image Captioning with External Visual-Name Memory for Open-World Comprehension
Authors: Jiaxuan Li, Duc Minh Vo, Akihiro Sugimoto, Hideki Nakayama
Abstract: Large language model (LLM)-based image captioning can describe objects not explicitly observed in training data; yet novel objects occur frequently, making it necessary to sustain up-to-date object knowledge for open-world comprehension. Instead of relying on large amounts of data and/or scaling up network parameters, we introduce a highly effective retrieval-augmented image captioning method that prompts LLMs with object names retrieved from an External Visual-name memory (EVCap). We build an ever-changing object knowledge memory using objects' visuals and names, enabling us to (i) update the memory at a minimal cost and (ii) effortlessly augment LLMs with retrieved object names by utilizing a lightweight and fast-to-train model. Our model, which was trained only on the COCO dataset, can adapt to out-of-domain data without requiring additional fine-tuning or re-training. Our experiments conducted on benchmarks and synthetic commonsense-violating data show that EVCap, with only 3.97M trainable parameters, exhibits superior performance compared to other methods based on frozen pre-trained LLMs. Its performance is also competitive with specialist SOTAs that require extensive training.
Submitted 7 April, 2024; v1 submitted 27 November, 2023; originally announced November 2023.
Comments: CVPR 2024
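
The abstract outlines the retrieve-then-prompt idea without implementation detail. A minimal sketch of that idea under simple assumptions follows: cosine-similarity retrieval over an embedding memory, with all function names and the prompt format as placeholders rather than EVCap's actual components.

import numpy as np

def retrieve_object_names(query_emb, memory_embs, memory_names, top_k=3):
    """Return the names whose stored visual embeddings are closest to the query.

    query_emb:    (D,) visual embedding of the input image (or a region of it).
    memory_embs:  (N, D) embeddings held in the external visual-name memory.
    memory_names: list of N object names aligned with memory_embs.
    """
    q = query_emb / (np.linalg.norm(query_emb) + 1e-8)
    m = memory_embs / (np.linalg.norm(memory_embs, axis=1, keepdims=True) + 1e-8)
    sims = m @ q                     # cosine similarity to every memory entry
    top = np.argsort(-sims)[:top_k]
    return [memory_names[i] for i in top]

def build_caption_prompt(image_emb, memory_embs, memory_names):
    # Retrieved names are simply handed to the (frozen) captioning LLM as hints.
    names = retrieve_object_names(image_emb, memory_embs, memory_names)
    return "Objects that may appear: " + ", ".join(names) + ". Describe the image."

Because the memory is just embedding/name pairs, adding a new object amounts to appending one row, which is consistent with the minimal-cost update claimed above.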
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">CVPR 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.12897">arXiv:2311.12897</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2311.12897">pdf</a>, <a href="https://arxiv.org/format/2311.12897">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> A Compact Dynamic 3D Gaussian Representation for Real-Time Dynamic View Synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Katsumata%2C+K">Kai Katsumata</a>, <a href="/search/cs?searchtype=author&amp;query=Vo%2C+D+M">Duc Minh Vo</a>, <a href="/search/cs?searchtype=author&amp;query=Nakayama%2C+H">Hideki Nakayama</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.12897v2-abstract-short" style="display: inline;"> 3D Gaussian Splatting (3DGS) has shown remarkable success in synthesizing novel views given multiple views of a static scene. Yet, 3DGS faces challenges when applied to dynamic scenes because 3D Gaussian parameters need to be updated per timestep, requiring a large amount of memory and at least a dozen observations per timestep. To address these limitations, we present a compact dynamic 3D Gaussia&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.12897v2-abstract-full').style.display = 'inline'; document.getElementById('2311.12897v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.12897v2-abstract-full" style="display: none;"> 3D Gaussian Splatting (3DGS) has shown remarkable success in synthesizing novel views given multiple views of a static scene. Yet, 3DGS faces challenges when applied to dynamic scenes because 3D Gaussian parameters need to be updated per timestep, requiring a large amount of memory and at least a dozen observations per timestep. To address these limitations, we present a compact dynamic 3D Gaussian representation that models positions and rotations as functions of time with a few parameter approximations while keeping other properties of 3DGS including scale, color and opacity invariant. Our method can dramatically reduce memory usage and relax a strict multi-view assumption. In our experiments on monocular and multi-view scenarios, we show that our method not only matches state-of-the-art methods, often linked with slower rendering speeds, in terms of high rendering quality but also significantly surpasses them by achieving a rendering speed of $118$ frames per second (FPS) at a resolution of 1,352$\times$1,014 on a single GPU. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.12897v2-abstract-full').style.display = 'none'; document.getElementById('2311.12897v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">17 pages, 11 figures, ECCV 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.14602">arXiv:2310.14602</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2310.14602">pdf</a>, <a href="https://arxiv.org/ps/2310.14602">ps</a>, <a href="https://arxiv.org/format/2310.14602">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Generative Pre-trained Transformer for Vietnamese Community-based COVID-19 Question Answering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Vo%2C+T+M">Tam Minh Vo</a>, <a href="/search/cs?searchtype=author&amp;query=Tran%2C+K+V">Khiem Vinh Tran</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.14602v2-abstract-short" style="display: inline;"> Recent studies have provided empirical evidence of the wide-ranging potential of Generative Pre-trained Transformer (GPT), a pretrained language model, in the field of natural language processing. GPT has been effectively employed as a decoder within state-of-the-art (SOTA) question answering systems, yielding exceptional performance across various tasks. However, the current research landscape co&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.14602v2-abstract-full').style.display = 'inline'; document.getElementById('2310.14602v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.14602v2-abstract-full" style="display: none;"> Recent studies have provided empirical evidence of the wide-ranging potential of Generative Pre-trained Transformer (GPT), a pretrained language model, in the field of natural language processing. GPT has been effectively employed as a decoder within state-of-the-art (SOTA) question answering systems, yielding exceptional performance across various tasks. However, the current research landscape concerning GPT&#39;s application in Vietnamese remains limited. This paper aims to address this gap by presenting an implementation of GPT-2 for community-based question answering specifically focused on COVID-19 related queries in Vietnamese. We introduce a novel approach by conducting a comparative analysis of different Transformers vs SOTA models in the community-based COVID-19 question answering dataset. 
The experimental findings demonstrate that the GPT-2 models exhibit highly promising outcomes, outperforming other SOTA models as well as previous community-based COVID-19 question answering models developed for Vietnamese.
Submitted 31 October, 2023; v1 submitted 23 October, 2023; originally announced October 2023.

8. arXiv:2308.10005 [pdf, other] cs.CV (Computer Vision and Pattern Recognition)
Title: Partition-and-Debias: Agnostic Biases Mitigation via A Mixture of Biases-Specific Experts
Authors: Jiaxuan Li, Duc Minh Vo, Hideki Nakayama
Abstract: Bias mitigation in image classification has been widely researched, and existing methods have yielded notable results. However, most of these methods implicitly assume that a given image contains only one type of known or unknown bias, failing to consider the complexities of real-world biases. We introduce a more challenging scenario, agnostic biases mitigation, aiming at bias removal regardless of whether the type of bias or the number of types is unknown in the datasets. To address this difficult task, we present the Partition-and-Debias (PnD) method that uses a mixture of biases-specific experts to implicitly divide the bias space into multiple subspaces and a gating module to find a consensus among experts to achieve debiased classification. Experiments on both public and constructed benchmarks demonstrated the efficacy of the PnD.
Code is available at: https://github.com/Jiaxuan-Li/PnD.
Submitted 19 August, 2023; originally announced August 2023.
Comments: ICCV 2023
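
The abstract gives only the high-level structure of PnD: a mixture of bias-specific experts plus a gating module that forms a consensus. Below is a minimal sketch of such a gated mixture head; layer sizes and the gating form are assumptions rather than the released implementation (see the repository linked above for the real code).

import torch
import torch.nn as nn

class MixtureOfBiasExperts(nn.Module):
    """Minimal gated mixture-of-experts classification head.

    Several experts implicitly cover different bias subspaces and a gating
    module combines their predictions into a consensus.
    """
    def __init__(self, feat_dim, num_classes, num_experts=4):
        super().__init__()
        self.experts = nn.ModuleList(
            [nn.Linear(feat_dim, num_classes) for _ in range(num_experts)]
        )
        self.gate = nn.Linear(feat_dim, num_experts)

    def forward(self, features):                                          # (B, feat_dim)
        weights = torch.softmax(self.gate(features), dim=-1)              # (B, E)
        logits = torch.stack([e(features) for e in self.experts], dim=1)  # (B, E, C)
        return (weights.unsqueeze(-1) * logits).sum(dim=1)                # (B, C) consensus logits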

9. arXiv:2307.08995 [pdf, other] cs.CV (Computer Vision and Pattern Recognition)
Title: Revisiting Latent Space of GAN Inversion for Real Image Editing
Authors: Kai Katsumata, Duc Minh Vo, Bei Liu, Hideki Nakayama
Abstract: The exploration of the latent space in StyleGANs and GAN inversion exemplify impressive real-world image editing, yet the trade-off between reconstruction quality and editing quality remains an open problem. In this study, we revisit StyleGANs' hyperspherical prior $\mathcal{Z}$ and combine it with highly capable latent spaces to build combined spaces that faithfully invert real images while maintaining the quality of edited images. More specifically, we propose $\mathcal{F}/\mathcal{Z}^{+}$ space consisting of two subspaces: $\mathcal{F}$ space of an intermediate feature map of StyleGANs enabling faithful reconstruction, and $\mathcal{Z}^{+}$ space of an extended StyleGAN prior supporting high editing quality. We project the real images into the proposed space to obtain the inverted codes, by which we then move along $\mathcal{Z}^{+}$, enabling semantic editing without sacrificing image quality. Comprehensive experiments show that $\mathcal{Z}^{+}$ can replace the most commonly-used $\mathcal{W}$, $\mathcal{W}^{+}$, and $\mathcal{S}$ spaces while preserving reconstruction quality, resulting in reduced distortion of edited images.
Submitted 18 July, 2023; originally announced July 2023.
Comments: 10 pages, 12 figures. arXiv admin note: substantial text overlap with arXiv:2306.00241

10. arXiv:2307.08319 [pdf, other] cs.CV (Computer Vision and Pattern Recognition)
Title: Soft Curriculum for Learning Conditional GANs with Noisy-Labeled and Uncurated Unlabeled Data
Authors: Kai Katsumata, Duc Minh Vo, Tatsuya Harada, Hideki Nakayama
Abstract: Label-noise or curated unlabeled data is used to compensate for the assumption of clean labeled data in training conditional generative adversarial networks; however, satisfying such an extended assumption is occasionally laborious or impractical. As a step towards generative modeling accessible to everyone, we introduce a novel conditional image generation framework that accepts noisy-labeled and uncurated unlabeled data during training: (i) closed-set and open-set label noise in labeled data and (ii) closed-set and open-set unlabeled data.
To combat this, we propose soft curriculum learning, which assigns instance-wise weights for adversarial training while assigning new labels for unlabeled data and correcting wrong labels for labeled data. Unlike popular curriculum learning, which uses a threshold to pick training samples, our soft curriculum controls the effect of each training instance by using the weights predicted by the auxiliary classifier, preserving useful samples while ignoring harmful ones. Our experiments show that our approach outperforms existing semi-supervised and label-noise robust methods in terms of both quantitative and qualitative performance. In particular, the proposed approach is able to match the performance of (semi-)supervised GANs even with less than half the labeled data.
Submitted 17 July, 2023; originally announced July 2023.
Comments: 10 pages, 13 figures
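
The abstract contrasts threshold-based curricula with instance-wise weighting. A minimal sketch of that weighting idea follows, assuming the auxiliary classifier's confidence is used directly as the weight; the paper's actual weighting function is not specified here.

import torch

def soft_curriculum_weights(aux_confidence):
    """Map auxiliary-classifier confidences to per-instance training weights.

    A hard curriculum keeps samples above a confidence threshold and drops the
    rest; the soft variant sketched here reuses the confidence itself as a
    weight, so no sample is discarded outright.
    """
    return aux_confidence.clamp(min=0.0, max=1.0)

def weighted_adversarial_loss(per_sample_loss, aux_confidence):
    # Down-weight likely-harmful (low-confidence) instances instead of masking them.
    weights = soft_curriculum_weights(aux_confidence)
    return (weights * per_sample_loss).sum() / (weights.sum() + 1e-8)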

11. arXiv:2306.00241 [pdf, other] cs.CV (Computer Vision and Pattern Recognition)
Title: Balancing Reconstruction and Editing Quality of GAN Inversion for Real Image Editing with StyleGAN Prior Latent Space
Authors: Kai Katsumata, Duc Minh Vo, Bei Liu, Hideki Nakayama
Abstract: The exploration of the latent space in StyleGANs and GAN inversion exemplify impressive real-world image editing, yet the trade-off between reconstruction quality and editing quality remains an open problem. In this study, we revisit StyleGANs' hyperspherical prior $\mathcal{Z}$ and $\mathcal{Z}^+$ and integrate them into seminal GAN inversion methods to improve editing quality. Besides faithful reconstruction, our extensions achieve sophisticated editing quality with the aid of the StyleGAN prior. We project the real images into the proposed space to obtain the inverted codes, by which we then move along $\mathcal{Z}^{+}$, enabling semantic editing without sacrificing image quality. Comprehensive experiments show that $\mathcal{Z}^{+}$ can replace the most commonly-used $\mathcal{W}$, $\mathcal{W}^{+}$, and $\mathcal{S}$ spaces while preserving reconstruction quality, resulting in reduced distortion of edited images.
Submitted 31 May, 2023; originally announced June 2023.
Comments: 5 pages, 9 figures, AI4CC Workshop at CVPR 2023

12. arXiv:2305.16487 [pdf, other] cs.CV (Computer Vision and Pattern Recognition), cs.AI (Artificial Intelligence)
Title: EgoHumans: An Egocentric 3D Multi-Human Benchmark
Authors: Rawal Khirodkar, Aayush Bansal, Lingni Ma, Richard Newcombe, Minh Vo, Kris Kitani
Abstract: We present EgoHumans, a new multi-view multi-human video benchmark to advance the state of the art in egocentric human 3D pose estimation and tracking. Existing egocentric benchmarks either capture single-subject or indoor-only scenarios, which limits the generalization of computer vision algorithms for real-world applications. We propose a novel 3D capture setup to construct a comprehensive egocentric multi-human benchmark in the wild, with annotations to support diverse tasks such as human detection, tracking, 2D/3D pose estimation, and mesh recovery. We leverage consumer-grade wearable camera-equipped glasses for the egocentric view, which enables us to capture dynamic activities such as playing tennis, fencing, and volleyball. Furthermore, our multi-view setup generates accurate 3D ground truth even under severe or complete occlusion. The dataset consists of more than 125k egocentric images, spanning diverse scenes with a particular focus on challenging and unchoreographed multi-human activities and fast-moving egocentric views. We rigorously evaluate existing state-of-the-art methods and highlight their limitations in the egocentric scenario, specifically on multi-human tracking. To address such limitations, we propose EgoFormer, a novel approach with a multi-stream transformer architecture and explicit 3D spatial reasoning to estimate and track the human pose. EgoFormer significantly outperforms prior art by 13.6% IDF1 on the EgoHumans dataset.
Submitted 18 August, 2023; v1 submitted 25 May, 2023; originally announced May 2023.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to ICCV 2023 (Oral)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2304.06602">arXiv:2304.06602</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2304.06602">pdf</a>, <a href="https://arxiv.org/format/2304.06602">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> A-CAP: Anticipation Captioning with Commonsense Knowledge </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Vo%2C+D+M">Duc Minh Vo</a>, <a href="/search/cs?searchtype=author&amp;query=Luong%2C+Q">Quoc-An Luong</a>, <a href="/search/cs?searchtype=author&amp;query=Sugimoto%2C+A">Akihiro Sugimoto</a>, <a href="/search/cs?searchtype=author&amp;query=Nakayama%2C+H">Hideki Nakayama</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2304.06602v1-abstract-short" style="display: inline;"> Humans possess the capacity to reason about the future based on a sparse collection of visual cues acquired over time. In order to emulate this ability, we introduce a novel task called Anticipation Captioning, which generates a caption for an unseen oracle image using a sparsely temporally-ordered set of images. To tackle this new task, we propose a model called A-CAP, which incorporates commonse&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.06602v1-abstract-full').style.display = 'inline'; document.getElementById('2304.06602v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2304.06602v1-abstract-full" style="display: none;"> Humans possess the capacity to reason about the future based on a sparse collection of visual cues acquired over time. In order to emulate this ability, we introduce a novel task called Anticipation Captioning, which generates a caption for an unseen oracle image using a sparsely temporally-ordered set of images. To tackle this new task, we propose a model called A-CAP, which incorporates commonsense knowledge into a pre-trained vision-language model, allowing it to anticipate the caption. Through both qualitative and quantitative evaluations on a customized visual storytelling dataset, A-CAP outperforms other image captioning methods and establishes a strong baseline for anticipation captioning. We also address the challenges inherent in this task. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.06602v1-abstract-full').style.display = 'none'; document.getElementById('2304.06602v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 April, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2023. 
Comments: Accepted to CVPR 2023

arXiv:2211.13470 [pdf, other] cs.CV, cs.AI, cs.LG
Efficient Zero-shot Visual Search via Target and Context-aware Transformer
Authors: Zhiwei Ding, Xuezhe Ren, Erwan David, Melissa Vo, Gabriel Kreiman, Mengmi Zhang
Abstract: Visual search is a ubiquitous challenge in natural vision, including daily tasks such as finding a friend in a crowd or searching for a car in a parking lot. Humans rely heavily on relevant target features to perform goal-directed visual search. Meanwhile, context is of critical importance for locating a target object in complex scenes, as it helps narrow down the search area and makes the search process more efficient. However, few works have combined both target and context information in computational models of visual search. Here we propose a zero-shot deep learning architecture, TCT (Target and Context-aware Transformer), that modulates self-attention in a Vision Transformer with target- and context-relevant information to enable human-like zero-shot visual search performance. Target modulation is computed as patch-wise local relevance between the target and search images, whereas contextual modulation is applied in a global fashion.
We conduct visual search experiments with TCT and other competitive visual search models on three natural scene datasets with varying levels of difficulty. TCT demonstrates human-like performance in terms of search efficiency and beats the SOTA models on challenging visual search tasks. Importantly, TCT generalizes well across datasets with novel objects without retraining or fine-tuning. Furthermore, we introduce a new dataset to benchmark models for invariant visual search under incongruent contexts. TCT manages to search flexibly via target and context modulation, even under incongruent contexts.
Submitted 24 November, 2022; originally announced November 2022.
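The target-modulation idea described above — scoring each search-image patch by its local relevance to the target and using those scores to bias self-attention — can be illustrated with a small, hypothetical sketch; the patch embeddings, the cosine-similarity measure, and the additive modulation form are assumptions for illustration, not the authors' released code:

```python
import torch
import torch.nn.functional as F

def target_modulated_attention(patch_tokens, target_tokens, attn_logits):
    """Illustrative sketch of target modulation for zero-shot visual search.

    patch_tokens:  (N, D) embeddings of search-image patches
    target_tokens: (M, D) embeddings of target-image patches
    attn_logits:   (N, N) raw self-attention logits over search patches

    Patch-wise target relevance is taken here as the maximum cosine
    similarity between each search patch and any target patch; the
    relevance is then added to the attention logits so that patches
    resembling the target receive more attention.
    """
    p = F.normalize(patch_tokens, dim=-1)               # (N, D)
    t = F.normalize(target_tokens, dim=-1)              # (M, D)
    relevance = (p @ t.T).max(dim=-1).values            # (N,) patch-wise local relevance
    modulated = attn_logits + relevance.unsqueeze(0)    # bias attention toward relevant patches
    return torch.softmax(modulated, dim=-1)             # (N, N) modulated attention weights
```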
arXiv:2210.08459 [pdf, other] cs.CL
StoryER: Automatic Story Evaluation via Ranking, Rating and Reasoning
Authors: Hong Chen, Duc Minh Vo, Hiroya Takamura, Yusuke Miyao, Hideki Nakayama
Abstract: Existing automatic story evaluation methods place a premium on lexical-level coherence, deviating from human preference. We go beyond this limitation by considering a novel Story Evaluation method that mimics human preference when judging a story, namely StoryER, which consists of three sub-tasks: Ranking, Rating and Reasoning. Given either a machine-generated or a human-written story, StoryER requires the machine to output 1) a preference score that corresponds to human preference, 2) specific ratings and their corresponding confidences, and 3) comments on various aspects (e.g., opening, character-shaping). To support these tasks, we introduce a well-annotated dataset comprising (i) 100k ranked story pairs and (ii) a set of 46k ratings and comments on various aspects of the stories. We finetune a Longformer-Encoder-Decoder (LED) on the collected dataset, with the encoder responsible for preference score and aspect prediction and the decoder for comment generation. Our comprehensive experiments establish a competitive benchmark for each task, showing high correlation with human preference. In addition, we observe that jointly learning the preference scores, the aspect ratings, and the comments brings gains to each individual task. Our dataset and benchmarks are publicly available to advance research on story evaluation. (Dataset and pre-trained model demo are available at http://storytelling-lab.com/eval and https://github.com/sairin1202/StoryER.)
Submitted 21 October, 2022; v1 submitted 16 October, 2022; originally announced October 2022.
Comments: accepted by EMNLP 2022
arXiv:2209.07629 [pdf, other] cs.SD, cs.LG, eess.AS
Self-Relation Attention and Temporal Awareness for Emotion Recognition via Vocal Burst
Authors: Dang-Linh Trinh, Minh-Cong Vo, Guee-Sang Lee
Abstract: This technical report presents our emotion recognition pipeline for the high-dimensional emotion task (A-VB High) in the ACII Affective Vocal Bursts (A-VB) 2022 Workshop & Competition. Our proposed method contains three stages. First, we extract latent features from the raw audio signal and its Mel-spectrogram using self-supervised learning methods. Then, the features from the raw signal are fed to the self-relation attention and temporal awareness (SA-TA) module to learn the valuable information between these latent features. Finally, we concatenate all the features and use a fully-connected layer to predict each emotion's score. In empirical experiments, our proposed method achieves a mean concordance correlation coefficient (CCC) of 0.7295 on the test set, compared to 0.5686 for the baseline model. The code of our method is available at https://github.com/linhtd812/A-VB2022.
Submitted 26 September, 2022; v1 submitted 15 September, 2022; originally announced September 2022.
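The result above is reported as a concordance correlation coefficient (CCC), so a short reference implementation of that metric may help; this is the standard definition of CCC, not code from the paper:

```python
import numpy as np

def concordance_correlation_coefficient(y_true, y_pred):
    """Concordance correlation coefficient (CCC) between two 1-D arrays.

    CCC = 2 * cov(x, y) / (var(x) + var(y) + (mean(x) - mean(y))**2)
    It equals 1 only for perfect agreement, penalizing both poor
    correlation and systematic bias, which is why it is a common
    metric for continuous emotion prediction.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    mean_t, mean_p = y_true.mean(), y_pred.mean()
    var_t, var_p = y_true.var(), y_pred.var()
    cov = ((y_true - mean_t) * (y_pred - mean_p)).mean()
    return 2 * cov / (var_t + var_p + (mean_t - mean_p) ** 2)
```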
arXiv:2209.04794 [pdf, other] eess.IV, cs.CV
DOI: 10.1371/journal.pone.0276545
Learning to diagnose common thorax diseases on chest radiographs from radiology reports in Vietnamese
Authors: Thao T. B. Nguyen, Tam M. Vo, Thang V. Nguyen, Hieu H. Pham, Ha Q. Nguyen
Abstract: We propose a data collection and annotation pipeline that extracts information from Vietnamese radiology reports to provide accurate labels for chest X-ray (CXR) images. This can benefit Vietnamese radiologists and clinicians by annotating data that closely match their endemic diagnosis categories, which may vary from country to country. To assess the efficacy of the proposed labeling technique, we built a CXR dataset containing 9,752 studies and evaluated our pipeline on a subset of this dataset. With an F1-score of at least 0.9923, the evaluation demonstrates that our labeling tool performs precisely and consistently across all classes. After building the dataset, we train deep learning models that leverage knowledge transferred from large public CXR datasets. We employ a variety of loss functions to overcome the curse of imbalanced multi-label datasets and conduct experiments with various model architectures to select the one that delivers the best performance. Our best model (CheXpert-pretrained EfficientNet-B2) yields an F1-score of 0.6989 (95% CI 0.6740, 0.7240), an AUC of 0.7912, a sensitivity of 0.7064 and a specificity of 0.8760 for the abnormal diagnosis in general. Finally, we demonstrate that our coarse classification (based on five specific locations of abnormalities) yields results comparable to fine classification (twelve pathologies) on the benchmark CheXpert dataset for general anomaly detection, while delivering better average performance across all classes.
Submitted 11 September, 2022; originally announced September 2022.
Comments: This work has been provisionally accepted for publication by the PLOS ONE journal
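The abstract mentions loss functions for imbalanced multi-label data; one common choice for such a setting is a per-class weighted binary cross-entropy, sketched below as an illustration. The weighting scheme and the example counts are assumptions, not the specific losses used in the paper:

```python
import torch
import torch.nn.functional as F

def weighted_multilabel_bce(logits, targets, pos_weight):
    """Weighted binary cross-entropy for imbalanced multi-label classification.

    logits:     (B, C) raw model outputs, one logit per pathology
    targets:    (B, C) binary labels
    pos_weight: (C,) per-class weight, e.g. num_negatives / num_positives,
                which up-weights rare positive findings.
    """
    return F.binary_cross_entropy_with_logits(logits, targets, pos_weight=pos_weight)

# Example: per-class weights estimated from (assumed) label frequencies.
label_counts = torch.tensor([120.0, 40.0, 8.0])       # positives per class in training set
total = 1000.0
pos_weight = (total - label_counts) / label_counts    # rarer classes get larger weights
```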
arXiv:2207.05249 [pdf, other] cs.CV
Efficient Human Vision Inspired Action Recognition using Adaptive Spatiotemporal Sampling
Authors: Khoi-Nguyen C. Mac, Minh N. Do, Minh P. Vo
Abstract: Adaptive sampling that exploits the spatiotemporal redundancy in videos is critical for always-on action recognition on wearable devices with limited computing and battery resources. The commonly used fixed sampling strategy is not context-aware and may under-sample the visual content, adversely impacting both computational efficiency and accuracy. Inspired by the concepts of foveal vision and pre-attentive processing from the human visual perception mechanism, we introduce a novel adaptive spatiotemporal sampling scheme for efficient action recognition. Our system pre-scans the global scene context at low resolution and decides whether to skip or to request high-resolution features at salient regions for further processing. We validate the system on the EPIC-KITCHENS and UCF-101 datasets for action recognition, and show that our proposed approach can greatly speed up inference with a tolerable loss of accuracy compared with state-of-the-art baselines. Source code is available at https://github.com/knmac/adaptive_spatiotemporal.
Submitted 14 July, 2022; v1 submitted 11 July, 2022; originally announced July 2022.
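The pre-scan-then-sample loop described above can be sketched generically as follows; `prescan_net`, the saliency threshold, the 4x downscale factor, and the crop size are all hypothetical stand-ins for illustration, not the components of the released implementation:

```python
import torch
import torch.nn.functional as F

def adaptive_spatiotemporal_sample(frames, prescan_net, saliency_threshold=0.5, crop=112):
    """Sketch of adaptive sampling: pre-scan frames at low resolution, then
    request a high-resolution crop only where the pre-scan finds a salient
    region. Frames are assumed to be at least `crop` pixels in each dimension.

    frames: (B, C, H, W) high-resolution video frames
    prescan_net: lightweight network returning a (B, 1, h, w) saliency map in [0, 1]
    """
    low_res = F.interpolate(frames, scale_factor=0.25, mode="bilinear", align_corners=False)
    saliency = prescan_net(low_res)
    crops = []
    for b in range(frames.shape[0]):
        smap = saliency[b, 0]
        if smap.max() < saliency_threshold:
            crops.append(None)                             # skip: frame judged uninformative
            continue
        y, x = divmod(int(smap.argmax()), smap.shape[1])   # most salient low-res location
        ys, xs = y * 4, x * 4                              # map back to full resolution
        ys = min(max(ys - crop // 2, 0), frames.shape[2] - crop)
        xs = min(max(xs - crop // 2, 0), frames.shape[3] - crop)
        crops.append(frames[b, :, ys:ys + crop, xs:xs + crop])
    return crops
```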
arXiv:2207.04320 [pdf, other] cs.CV
Snipper: A Spatiotemporal Transformer for Simultaneous Multi-Person 3D Pose Estimation, Tracking and Forecasting on a Video Snippet
Authors: Shihao Zou, Yuanlu Xu, Chao Li, Lingni Ma, Li Cheng, Minh Vo
Abstract: Multi-person pose understanding from RGB videos involves three complex tasks: pose estimation, tracking and motion forecasting. Intuitively, accurate multi-person pose estimation facilitates robust tracking, and robust tracking builds crucial history for correct motion forecasting. Most existing works either focus on a single task or employ multi-stage approaches that solve each task separately, which tends to make sub-optimal decisions at each stage and fails to exploit correlations among the three tasks. In this paper, we propose Snipper, a unified framework that performs multi-person 3D pose estimation, tracking, and motion forecasting simultaneously in a single stage. We propose an efficient yet powerful deformable attention mechanism to aggregate spatiotemporal information from the video snippet. Building upon this deformable attention, a video transformer is learned to encode the spatiotemporal features from the multi-frame snippet and to decode informative pose features for multi-person pose queries. Finally, these pose queries are regressed to predict multi-person pose trajectories and future motions in a single shot. In our experiments, we show the effectiveness of Snipper on three challenging public datasets, where our generic model rivals specialized state-of-the-art baselines for pose estimation, tracking, and forecasting.
Submitted 12 September, 2023; v1 submitted 9 July, 2022; originally announced July 2022.
arXiv:2206.08929 [pdf, other] cs.CV, cs.AI
TAVA: Template-free Animatable Volumetric Actors
Authors: Ruilong Li, Julian Tanke, Minh Vo, Michael Zollhofer, Jurgen Gall, Angjoo Kanazawa, Christoph Lassner
Abstract: Coordinate-based volumetric representations have the potential to generate photo-realistic virtual avatars from images. However, virtual avatars also need to be controllable, even in novel poses that may not have been observed. Traditional techniques, such as LBS, provide such control, yet they usually require a hand-designed body template, 3D scan data, and limited appearance models. On the other hand, neural representations have been shown to be powerful in representing visual details, but are underexplored for deforming dynamic articulated actors. In this paper, we propose TAVA, a method to create Template-free Animatable Volumetric Actors based on neural representations. We rely solely on multi-view data and a tracked skeleton to create a volumetric model of an actor, which can be animated at test time given a novel pose. Since TAVA does not require a body template, it is applicable to humans as well as other creatures such as animals. Furthermore, TAVA is designed to recover accurate dense correspondences, making it amenable to content-creation and editing tasks. Through extensive experiments, we demonstrate that the proposed method generalizes well to novel poses as well as unseen views, and we showcase basic editing capabilities.
Submitted 20 June, 2022; v1 submitted 17 June, 2022; originally announced June 2022.
Comments: Code: https://github.com/facebookresearch/tava; Project Website: https://www.liruilong.cn/projects/tava/
arXiv:2204.14249 [pdf, other] cs.CV
OSSGAN: Open-Set Semi-Supervised Image Generation
Authors: Kai Katsumata, Duc Minh Vo, Hideki Nakayama
Abstract: We introduce a challenging training scheme for conditional GANs, called open-set semi-supervised image generation, where the training dataset consists of two parts: (i) labeled data and (ii) unlabeled data containing both samples that belong to one of the labeled classes (a closed set) and samples that do not belong to any of the labeled classes (an open set). Unlike the existing semi-supervised image generation task, where unlabeled data contain only closed-set samples, our task is more general and lowers the data collection cost in practice by allowing open-set samples to appear. Thanks to entropy regularization, the classifier trained on labeled data is able to quantify sample-wise importance to the training of the cGAN as confidence, allowing us to use all samples in the unlabeled data. We design OSSGAN, which provides decision clues to the discriminator on the basis of whether an unlabeled image belongs to one or none of the classes of interest, smoothly integrating labeled and unlabeled data during training. The results of experiments on Tiny ImageNet and ImageNet show notable improvements over supervised BigGAN and semi-supervised methods. Our code is available at https://github.com/raven38/OSSGAN.
Submitted 29 April, 2022; originally announced April 2022.
Comments: Accepted at CVPR 2022
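The confidence idea above — using an auxiliary classifier's prediction entropy to weight unlabeled samples — can be sketched as follows; the entropy normalization and the linear mapping to a confidence weight are assumptions for illustration, not the paper's exact formulation:

```python
import torch

def entropy_confidence(classifier_logits):
    """Turn classifier predictions on unlabeled images into per-sample confidences.

    classifier_logits: (B, K) logits over the K labeled classes.
    Low prediction entropy (a peaked distribution) suggests a closed-set
    sample and yields a confidence near 1; high entropy (near-uniform)
    suggests an open-set sample and yields a confidence near 0.
    """
    probs = torch.softmax(classifier_logits, dim=-1)
    entropy = -(probs * torch.log(probs.clamp_min(1e-12))).sum(dim=-1)       # (B,)
    max_entropy = torch.log(torch.tensor(float(classifier_logits.shape[-1])))
    return 1.0 - entropy / max_entropy                                       # (B,) weights in [0, 1]
```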
arXiv:2204.01695 [pdf, other] cs.CV
LISA: Learning Implicit Shape and Appearance of Hands
Authors: Enric Corona, Tomas Hodan, Minh Vo, Francesc Moreno-Noguer, Chris Sweeney, Richard Newcombe, Lingni Ma
Abstract: This paper proposes a do-it-all neural model of human hands, named LISA. The model can capture accurate hand shape and appearance, generalize to arbitrary hand subjects, provide dense surface correspondences, be reconstructed from images in the wild, and be easily animated. We train LISA by minimizing shape and appearance losses on a large set of multi-view RGB image sequences annotated with coarse 3D poses of the hand skeleton. For a 3D point in the hand's local coordinate frame, our model predicts the color and the signed distance with respect to each hand bone independently, and then combines the per-bone predictions using predicted skinning weights. The shape, color and pose representations are disentangled by design, allowing estimation or animation of only selected parameters. We experimentally demonstrate that LISA can accurately reconstruct a dynamic hand from monocular or multi-view sequences, achieving a noticeably higher quality of reconstructed hand shapes compared to baseline approaches. Project page: https://www.iri.upc.edu/people/ecorona/lisa/.
Submitted 4 April, 2022; originally announced April 2022.
Comments: Published at CVPR 2022
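The per-bone prediction and skinning-weight composition described in the abstract can be sketched as below; the tensor interfaces and the softmax-based weight normalization are illustrative assumptions rather than the paper's implementation:

```python
import torch

def compose_per_bone_predictions(sdf_per_bone, color_per_bone, skinning_logits):
    """Combine independent per-bone predictions into a single query result.

    sdf_per_bone:    (B, P, J)    signed distance of each point w.r.t. each of J bones
    color_per_bone:  (B, P, J, 3) per-bone RGB predictions
    skinning_logits: (B, P, J)    predicted (unnormalized) skinning weights

    Each query point's final SDF and color are weighted averages of the
    per-bone predictions, with weights summing to one over the bones.
    """
    w = torch.softmax(skinning_logits, dim=-1)               # (B, P, J)
    sdf = (w * sdf_per_bone).sum(dim=-1)                     # (B, P)
    color = (w.unsqueeze(-1) * color_per_bone).sum(dim=-2)   # (B, P, 3)
    return sdf, color
```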
arXiv:2203.14499 [pdf, other] cs.CV
NOC-REK: Novel Object Captioning with Retrieved Vocabulary from External Knowledge
Authors: Duc Minh Vo, Hong Chen, Akihiro Sugimoto, Hideki Nakayama
Abstract: Novel object captioning aims at describing objects absent from the training data, with the key ingredient being the provision of object vocabulary to the model. Although existing methods rely heavily on an object detection model, we view the detection step as vocabulary retrieval from external knowledge, in the form of embeddings of object definitions from Wiktionary, using image region features learned from a transformer model as the retrieval queries. We propose an end-to-end Novel Object Captioning with Retrieved vocabulary from External Knowledge method (NOC-REK), which simultaneously learns vocabulary retrieval and caption generation, successfully describing novel objects outside of the training dataset. Furthermore, our model eliminates the requirement for model retraining by simply updating the external knowledge whenever a novel object appears. Our comprehensive experiments on the held-out COCO and nocaps datasets show that NOC-REK is considerably more effective than SOTA methods.
Submitted 28 March, 2022; originally announced March 2022.
Comments: Accepted at CVPR 2022
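A minimal sketch of the retrieval step described above — matching image region features against an external bank of definition embeddings and returning the closest vocabulary entries; the cosine-similarity retrieval and the top-k choice are assumptions for illustration:

```python
import torch
import torch.nn.functional as F

def retrieve_vocabulary(region_features, definition_embeddings, vocabulary, k=5):
    """Retrieve candidate object words from external knowledge.

    region_features:       (R, D) image region features (e.g., from a transformer backbone)
    definition_embeddings: (V, D) embeddings of object definitions (e.g., from Wiktionary)
    vocabulary:            list of V words aligned with the embeddings
    Returns the top-k words for each region by cosine similarity.
    """
    q = F.normalize(region_features, dim=-1)
    d = F.normalize(definition_embeddings, dim=-1)
    sims = q @ d.T                             # (R, V) similarity of every region to every definition
    topk = sims.topk(k, dim=-1).indices        # (R, k)
    return [[vocabulary[j] for j in row.tolist()] for row in topk]
```

Under this reading, supporting a new object amounts to appending its definition embedding to the bank, which mirrors the update-without-retraining property claimed in the abstract.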
arXiv:2203.08456 [pdf, other] cs.CV
PPCD-GAN: Progressive Pruning and Class-Aware Distillation for Large-Scale Conditional GANs Compression
Authors: Duc Minh Vo, Akihiro Sugimoto, Hideki Nakayama
Abstract: We push forward neural network compression research by tackling the novel, challenging task of compressing large-scale conditional generative adversarial networks (GANs). To this end, we propose a gradually shrinking GAN (PPCD-GAN) by introducing a progressive pruning residual block (PP-Res) and class-aware distillation. The PP-Res is an extension of the conventional residual block in which each convolutional layer is followed by a learnable mask layer that progressively prunes network parameters as training proceeds. Class-aware distillation, on the other hand, enhances training stability by transferring rich knowledge from a well-trained teacher model through instructive attention maps. We train the pruning and distillation processes simultaneously, end-to-end, on a well-known GAN architecture. After training, all redundant parameters as well as the mask layers are discarded, yielding a lighter network while retaining performance. On the ImageNet 128x128 dataset, we show comprehensively that PPCD-GAN reduces parameters by up to 5.2x (81%) compared with state-of-the-art methods while achieving better performance.
Submitted 16 March, 2022; originally announced March 2022.
Comments: accepted at WACV 2022
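One way to read the learnable mask layer behind PP-Res is a per-channel gate after each convolution, with a sparsity penalty pushing gates toward zero so the corresponding channels can be discarded after training. The sketch below follows that reading; the sigmoid gating and L1 penalty are assumptions, not the authors' exact layer:

```python
import torch
import torch.nn as nn

class ChannelMask(nn.Module):
    """Learnable per-channel gate placed after a convolution.

    Each channel is scaled by a sigmoid gate; an L1 penalty on the gates
    (added to the training loss) pushes many of them toward zero, so the
    corresponding channels can be pruned once training finishes.
    """

    def __init__(self, num_channels):
        super().__init__()
        self.gate_logits = nn.Parameter(torch.zeros(num_channels))

    def forward(self, x):                       # x: (B, C, H, W)
        gates = torch.sigmoid(self.gate_logits)
        return x * gates.view(1, -1, 1, 1)

    def sparsity_penalty(self):
        return torch.sigmoid(self.gate_logits).sum()

# Usage: a convolution followed by its mask layer.
conv = nn.Conv2d(64, 128, kernel_size=3, padding=1)
mask = ChannelMask(128)
y = mask(conv(torch.randn(2, 64, 32, 32)))
```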
arXiv:2202.10753 [pdf, other] cs.CV, cs.AI, cs.LG, eess.IV, physics.data-an
Convolutional Neural Network Modelling for MODIS Land Surface Temperature Super-Resolution
Authors: Binh Minh Nguyen, Ganglin Tian, Minh-Triet Vo, Aurélie Michel, Thomas Corpetti, Carlos Granero-Belinchon
Abstract: Nowadays, thermal infrared satellite remote sensors make it possible to extract valuable information at large scale, in particular Land Surface Temperature (LST). However, such data are limited in spatial and/or temporal resolution, which prevents analysis at fine scales. For example, the MODIS satellite provides daily acquisitions at 1 km spatial resolution, which is not sufficient for highly heterogeneous environments such as agricultural parcels. Image super-resolution is therefore a crucial step toward better exploiting MODIS LSTs, and it is the issue tackled in this paper. We introduce a deep learning-based algorithm, named Multi-residual U-Net, for super-resolution of single MODIS LST images. Our proposed network is a modified U-Net architecture that super-resolves the input LST image from 1 km to 250 m per pixel. The results show that our Multi-residual U-Net outperforms other state-of-the-art methods.
Submitted 1 April, 2022; v1 submitted 22 February, 2022; originally announced February 2022.
arXiv:2112.12761 [pdf, other] cs.CV, cs.GR
BANMo: Building Animatable 3D Neural Models from Many Casual Videos
Authors: Gengshan Yang, Minh Vo, Natalia Neverova, Deva Ramanan, Andrea Vedaldi, Hanbyul Joo
Abstract: Prior work on articulated 3D shape reconstruction often relies on specialized sensors (e.g., synchronized multi-camera systems) or pre-built 3D deformable models (e.g., SMAL or SMPL). Such methods are not able to scale to diverse sets of objects in the wild. We present BANMo, a method that requires neither a specialized sensor nor a pre-defined template shape. BANMo builds high-fidelity, articulated 3D models (including shape and animatable skinning weights) from many monocular casual videos in a differentiable rendering framework. While the use of many videos provides more coverage of camera views and object articulations, it introduces significant challenges in establishing correspondence across scenes with different backgrounds, illumination conditions, etc. Our key insight is to merge three schools of thought: (1) classic deformable shape models that make use of articulated bones and blend skinning, (2) volumetric neural radiance fields (NeRFs) that are amenable to gradient-based optimization, and (3) canonical embeddings that generate correspondences between pixels and an articulated model. We introduce neural blend skinning models that allow for differentiable and invertible articulated deformations.
When combined with canonical embeddings, such models allow us to establish dense correspondences across videos that can be self-supervised with cycle consistency. On real and synthetic datasets, BANMo shows higher-fidelity 3D reconstructions than prior work for humans and animals, with the ability to render realistic images from novel viewpoints and poses. Project webpage: banmo-www.github.io.
Submitted 3 April, 2023; v1 submitted 23 December, 2021; originally announced December 2021.
Comments: CVPR 2022 camera-ready version (last update: May 2022)
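Blend skinning, which BANMo builds on, deforms each 3D point as a weighted combination of per-bone rigid transforms. The generic linear-blend-skinning sketch below illustrates the basic operation only; it is not BANMo's neural, invertible variant:

```python
import torch

def linear_blend_skinning(points, skinning_weights, bone_rotations, bone_translations):
    """Classic linear blend skinning (LBS).

    points:            (P, 3)    canonical 3D points
    skinning_weights:  (P, J)    per-point weights over J bones (rows sum to 1)
    bone_rotations:    (J, 3, 3) rigid rotation of each bone
    bone_translations: (J, 3)    rigid translation of each bone
    Each point is transformed by every bone and the results are blended
    by the skinning weights.
    """
    # (J, P, 3): every point transformed by every bone
    per_bone = torch.einsum("jab,pb->jpa", bone_rotations, points) + bone_translations[:, None, :]
    # Blend over bones: (P, 3)
    return torch.einsum("pj,jpa->pa", skinning_weights, per_bone)
```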
arXiv:2110.07058 [pdf, other] cs.CV, cs.AI
Ego4D: Around the World in 3,000 Hours of Egocentric Video
Authors: Kristen Grauman, Andrew Westbury, Eugene Byrne, Zachary Chavis, Antonino Furnari, Rohit Girdhar, Jackson Hamburger, Hao Jiang, Miao Liu, Xingyu Liu, Miguel Martin, Tushar Nagarajan, Ilija Radosavovic, Santhosh Kumar Ramakrishnan, Fiona Ryan, Jayant Sharma, Michael Wray, Mengmeng Xu, Eric Zhongcong Xu, Chen Zhao, Siddhant Bansal, Dhruv Batra, Vincent Cartillier, Sean Crane, Tien Do, et al. (60 additional authors not shown)
Abstract: We introduce Ego4D, a massive-scale egocentric video dataset and benchmark suite. It offers 3,670 hours of daily-life activity video spanning hundreds of scenarios (household, outdoor, workplace, leisure, etc.) captured by 931 unique camera wearers from 74 worldwide locations and 9 different countries. The collection approach is designed to uphold rigorous privacy and ethics standards, with consenting participants and robust de-identification procedures where relevant. Ego4D dramatically expands the volume of diverse egocentric video footage publicly available to the research community. Portions of the video are accompanied by audio, 3D meshes of the environment, eye gaze, stereo, and/or synchronized videos from multiple egocentric cameras at the same event. Furthermore, we present a host of new benchmark challenges centered around understanding the first-person visual experience in the past (querying an episodic memory), present (analyzing hand-object manipulation, audio-visual conversation, and social interactions), and future (forecasting activities). By publicly sharing this massive annotated dataset and benchmark suite, we aim to push the frontier of first-person perception. Project page: https://ego4d-data.org/
Submitted 11 March, 2022; v1 submitted 13 October, 2021; originally announced October 2021.
Comments: To appear in the Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), 2022.
This version updates the baseline result numbers for the Hands and Objects benchmark (appendix)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2108.10165">arXiv:2108.10165</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2108.10165">pdf</a>, <a href="https://arxiv.org/format/2108.10165">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ODAM: Object Detection, Association, and Mapping using Posed RGB Video </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+K">Kejie Li</a>, <a href="/search/cs?searchtype=author&amp;query=DeTone%2C+D">Daniel DeTone</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+S">Steven Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Vo%2C+M">Minh Vo</a>, <a href="/search/cs?searchtype=author&amp;query=Reid%2C+I">Ian Reid</a>, <a href="/search/cs?searchtype=author&amp;query=Rezatofighi%2C+H">Hamid Rezatofighi</a>, <a href="/search/cs?searchtype=author&amp;query=Sweeney%2C+C">Chris Sweeney</a>, <a href="/search/cs?searchtype=author&amp;query=Straub%2C+J">Julian Straub</a>, <a href="/search/cs?searchtype=author&amp;query=Newcombe%2C+R">Richard Newcombe</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2108.10165v1-abstract-short" style="display: inline;"> Localizing objects and estimating their extent in 3D is an important step towards high-level 3D scene understanding, which has many applications in Augmented Reality and Robotics. We present ODAM, a system for 3D Object Detection, Association, and Mapping using posed RGB videos. The proposed system relies on a deep learning front-end to detect 3D objects from a given RGB frame and associate them t&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2108.10165v1-abstract-full').style.display = 'inline'; document.getElementById('2108.10165v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2108.10165v1-abstract-full" style="display: none;"> Localizing objects and estimating their extent in 3D is an important step towards high-level 3D scene understanding, which has many applications in Augmented Reality and Robotics. We present ODAM, a system for 3D Object Detection, Association, and Mapping using posed RGB videos. The proposed system relies on a deep learning front-end to detect 3D objects from a given RGB frame and associate them to a global object-based map using a graph neural network (GNN). Based on these frame-to-model associations, our back-end optimizes object bounding volumes, represented as super-quadrics, under multi-view geometry constraints and the object scale prior. We validate the proposed system on ScanNet where we show a significant improvement over existing RGB-only methods. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2108.10165v1-abstract-full').style.display = 'none'; document.getElementById('2108.10165v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 August, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted in ICCV 2021 as oral</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2104.07267">arXiv:2104.07267</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2104.07267">pdf</a>, <a href="https://arxiv.org/format/2104.07267">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ContactOpt: Optimizing Contact to Improve Grasps </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Grady%2C+P">Patrick Grady</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+C">Chengcheng Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Twigg%2C+C+D">Christopher D. Twigg</a>, <a href="/search/cs?searchtype=author&amp;query=Vo%2C+M">Minh Vo</a>, <a href="/search/cs?searchtype=author&amp;query=Brahmbhatt%2C+S">Samarth Brahmbhatt</a>, <a href="/search/cs?searchtype=author&amp;query=Kemp%2C+C+C">Charles C. Kemp</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2104.07267v1-abstract-short" style="display: inline;"> Physical contact between hands and objects plays a critical role in human grasps. We show that optimizing the pose of a hand to achieve expected contact with an object can improve hand poses inferred via image-based methods. Given a hand mesh and an object mesh, a deep model trained on ground truth contact data infers desirable contact across the surfaces of the meshes. Then, ContactOpt efficientl&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2104.07267v1-abstract-full').style.display = 'inline'; document.getElementById('2104.07267v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2104.07267v1-abstract-full" style="display: none;"> Physical contact between hands and objects plays a critical role in human grasps. We show that optimizing the pose of a hand to achieve expected contact with an object can improve hand poses inferred via image-based methods. Given a hand mesh and an object mesh, a deep model trained on ground truth contact data infers desirable contact across the surfaces of the meshes. Then, ContactOpt efficiently optimizes the pose of the hand to achieve desirable contact using a differentiable contact model. Notably, our contact model encourages mesh interpenetration to approximate deformable soft tissue in the hand. In our evaluations, our methods result in grasps that better match ground truth contact, have lower kinematic error, and are significantly preferred by human participants. 
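The super-quadric bounding volumes mentioned in the ODAM abstract have a closed-form inside-outside function, which is what makes them convenient to optimize under multi-view constraints. The numpy sketch below (names and the residual form are our assumptions, not the paper's code) evaluates that function and a simple surface-fitting residual.

    # Minimal sketch of a super-quadric inside-outside function (illustrative only).
    import numpy as np

    def superquadric_F(points, scale, eps1, eps2):
        """F(x) < 1 inside, = 1 on the surface, > 1 outside.
        points: (N, 3) points in the object's local frame.
        scale:  (3,) semi-axis lengths (a1, a2, a3).
        eps1, eps2: shape exponents (1.0, 1.0 gives an ellipsoid)."""
        x, y, z = (np.abs(points) / scale).T
        xy = (x ** (2.0 / eps2) + y ** (2.0 / eps2)) ** (eps2 / eps1)
        return xy + z ** (2.0 / eps1)

    def surface_residual(points, scale, eps1, eps2):
        # Simple least-squares residual pulling observed points onto the surface.
        return superquadric_F(points, scale, eps1, eps2) ** eps1 - 1.0

    # Example: points sampled on a unit sphere give near-zero residuals.
    pts = np.random.randn(100, 3)
    pts /= np.linalg.norm(pts, axis=1, keepdims=True)
    print(np.abs(surface_residual(pts, np.ones(3), 1.0, 1.0)).max())

A back-end in the spirit of the abstract would stack such residuals over the observations associated with each object and solve for shape, scale, and pose jointly with the multi-view constraints.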
arXiv:2104.07267 [pdf, other] cs.CV
ContactOpt: Optimizing Contact to Improve Grasps
Authors: Patrick Grady, Chengcheng Tang, Christopher D. Twigg, Minh Vo, Samarth Brahmbhatt, Charles C. Kemp
Abstract: Physical contact between hands and objects plays a critical role in human grasps. We show that optimizing the pose of a hand to achieve expected contact with an object can improve hand poses inferred via image-based methods. Given a hand mesh and an object mesh, a deep model trained on ground-truth contact data infers desirable contact across the surfaces of the meshes. Then, ContactOpt efficiently optimizes the pose of the hand to achieve desirable contact using a differentiable contact model. Notably, our contact model encourages mesh interpenetration to approximate deformable soft tissue in the hand. In our evaluations, our methods result in grasps that better match ground-truth contact, have lower kinematic error, and are significantly preferred by human participants. Code and models are available online.
Submitted 15 April, 2021; originally announced April 2021.
Comments: Conference on Computer Vision and Pattern Recognition (CVPR) 2021
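To illustrate the mechanics of pose refinement driven by a differentiable contact objective, here is a deliberately toy PyTorch loop: a per-point weight plays the role of the inferred contact map, the "object" is a sphere, and only a global translation is optimized. ContactOpt itself operates on MANO hand meshes, a learned contact model, and full hand pose parameters, none of which this sketch reproduces.

    # Toy contact-driven refinement (not ContactOpt itself).
    import torch

    def contact_loss(hand_pts, contact_w, obj_center, obj_radius):
        # Weighted squared distance between hand points and the object surface;
        # contact_w in [0, 1] stands in for an inferred per-point contact map.
        d = torch.linalg.norm(hand_pts - obj_center, dim=-1) - obj_radius
        return (contact_w * d ** 2).mean()

    hand_pts0 = torch.randn(778, 3)              # 778 vertices, as in a MANO hand mesh
    contact_w = (hand_pts0[:, 0] > 0).float()    # pretend these points should touch
    translation = torch.zeros(3, requires_grad=True)
    opt = torch.optim.Adam([translation], lr=1e-2)

    for step in range(200):
        opt.zero_grad()
        loss = contact_loss(hand_pts0 + translation, contact_w,
                            obj_center=torch.tensor([0.5, 0.0, 0.0]), obj_radius=0.3)
        loss.backward()
        opt.step()
    print(float(loss))

The point is only that the contact objective is differentiable in the pose parameters, so standard gradient-based optimizers apply.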
arXiv:2012.12890 [pdf, other] cs.CV
ANR: Articulated Neural Rendering for Virtual Avatars
Authors: Amit Raj, Julian Tanke, James Hays, Minh Vo, Carsten Stoll, Christoph Lassner
Abstract: The combination of traditional rendering with neural networks in Deferred Neural Rendering (DNR) provides a compelling balance between computational complexity and realism of the resulting images. Using skinned meshes for rendering articulating objects is a natural extension of the DNR framework and would open it up to a plethora of applications. However, in this case the neural shading step must account for deformations that are possibly not captured in the mesh, as well as alignment inaccuracies and dynamics, which can confound the DNR pipeline. We present Articulated Neural Rendering (ANR), a novel framework based on DNR that explicitly addresses its limitations for virtual human avatars. We show the superiority of ANR not only with respect to DNR but also with respect to methods specialized for avatar creation and animation. In two user studies, we observe a clear preference for our avatar model, and we demonstrate state-of-the-art performance on quantitative evaluation metrics. Perceptually, we observe better temporal stability, level of detail, and plausibility.
Submitted 23 December, 2020; originally announced December 2020.

arXiv:2008.00158 [pdf, ps, other] cs.CV
TexMesh: Reconstructing Detailed Human Texture and Geometry from RGB-D Video
Authors: Tiancheng Zhi, Christoph Lassner, Tony Tung, Carsten Stoll, Srinivasa G. Narasimhan, Minh Vo
Abstract: We present TexMesh, a novel approach to reconstruct detailed human meshes with high-resolution full-body texture from RGB-D video. TexMesh enables high-quality free-viewpoint rendering of humans. Given the RGB frames, the captured environment map, and the coarse per-frame human mesh from RGB-D tracking, our method reconstructs spatiotemporally consistent and detailed per-frame meshes along with a high-resolution albedo texture. By using the incident illumination we are able to accurately estimate local surface geometry and albedo, which allows us to further use photometric constraints to adapt a synthetically trained model to real-world sequences in a self-supervised manner for detailed surface geometry and high-resolution texture estimation. In practice, we train our models on a short example sequence for self-adaptation, after which the model runs at an interactive frame rate. We validate TexMesh on synthetic and real-world data and show that it outperforms the state of the art both quantitatively and qualitatively.
Submitted 20 September, 2020; v1 submitted 31 July, 2020; originally announced August 2020.
Comments: ECCV 2020
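The photometric self-supervision described here can be caricatured as: shade the predicted geometry with the captured illumination and compare against the observed pixels. The sketch below assumes a single directional light and Lambertian shading purely for illustration; TexMesh's actual renderer, environment-map lighting, and loss terms are more involved, and all names here are hypothetical.

    # Simplified photometric self-supervision loss (illustrative only).
    import torch

    def photometric_loss(pred_albedo, pred_normals, light_dir, light_color, observed, mask):
        """pred_albedo: (N, 3), pred_normals: (N, 3) unit vectors, light_dir: (3,),
        light_color: (3,), observed: (N, 3) pixels, mask: (N,) in {0, 1}."""
        shading = torch.clamp((pred_normals * light_dir).sum(-1, keepdim=True), min=0.0)
        rendered = pred_albedo * shading * light_color           # toy Lambertian rendering
        per_pixel = (rendered - observed).abs().sum(-1)
        return (per_pixel * mask).sum() / mask.sum().clamp(min=1)

Backpropagating such a loss through the albedo and geometry predictions over a short example sequence is, roughly, what the self-adaptation phase described above amounts to.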
arXiv:2007.12806 [pdf, other] cs.CV
Spatiotemporal Bundle Adjustment for Dynamic 3D Human Reconstruction in the Wild
Authors: Minh Vo, Yaser Sheikh, Srinivasa G. Narasimhan
Abstract: Bundle adjustment jointly optimizes camera intrinsics and extrinsics and 3D point triangulation to reconstruct a static scene. The triangulation constraint, however, is invalid for moving points captured in multiple unsynchronized videos, and bundle adjustment is not designed to estimate the temporal alignment between cameras. We present a spatiotemporal bundle adjustment framework that jointly optimizes four coupled sub-problems: estimating camera intrinsics and extrinsics, triangulating static 3D points, sub-frame temporal alignment between cameras, and computing 3D trajectories of dynamic points. Key to our joint optimization is the careful integration of physics-based motion priors within the reconstruction pipeline, validated on a large motion-capture corpus of human subjects. We devise an incremental reconstruction and alignment algorithm that strictly enforces the motion prior during the spatiotemporal bundle adjustment. This algorithm is further made more efficient by a divide-and-conquer scheme while still maintaining high accuracy. We apply this algorithm to reconstruct 3D motion trajectories of human bodies in dynamic events captured by multiple uncalibrated and unsynchronized video cameras in the wild. To make the reconstruction visually more interpretable, we fit a statistical 3D human body model to the asynchronous video streams. Compared to the baseline, the fitting significantly benefits from the proposed spatiotemporal bundle adjustment procedure. Because the videos are aligned with sub-frame precision, we reconstruct 3D motion at a much higher temporal resolution than the input videos.
Submitted 24 July, 2020; originally announced July 2020.
Comments: Accepted to IEEE TPAMI
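The core coupling the abstract describes is that a moving point's reprojection depends on when each camera actually sampled it, so the 3D trajectory and the per-camera sub-frame offset must be estimated together. A minimal numpy sketch of that residual follows; variable names are ours, and the paper's formulation additionally refines intrinsics and extrinsics and imposes physics-based motion priors.

    # Reprojection residual for a moving point with a sub-frame time offset (sketch).
    import numpy as np

    def traj_at(traj_times, traj_xyz, t):
        # Piecewise-linear 3D trajectory evaluated at a (possibly fractional) time t.
        return np.array([np.interp(t, traj_times, traj_xyz[:, k]) for k in range(3)])

    def reprojection_residual(K, R, tvec, frame_idx, time_offset,
                              traj_times, traj_xyz, obs_uv):
        # 2D residual for one observation of a moving point in one camera frame.
        X = traj_at(traj_times, traj_xyz, frame_idx + time_offset)  # point at capture time
        x_cam = R @ X + tvec
        uv = (K @ x_cam)[:2] / x_cam[2]
        return uv - obs_uv

Stacking such residuals over all observations and handing them to a nonlinear least-squares solver, with the time offsets and trajectory knots among the unknowns, gives the flavor of the joint spatiotemporal optimization.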
arXiv:2007.03672 [pdf, other] cs.CV
Long-term Human Motion Prediction with Scene Context
Authors: Zhe Cao, Hang Gao, Karttikeya Mangalam, Qi-Zhi Cai, Minh Vo, Jitendra Malik
Abstract: Human movement is goal-directed and influenced by the spatial layout of the objects in the scene. To plan future human motion, it is crucial to perceive the environment -- imagine how hard it is to navigate a new room with the lights off. Existing works on predicting human motion do not pay attention to the scene context and thus struggle in long-term prediction. In this work, we propose a novel three-stage framework that exploits scene context to tackle this task. Given a single scene image and 2D pose histories, our method first samples multiple human motion goals, then plans 3D human paths towards each goal, and finally predicts 3D human pose sequences following each path. For stable training and rigorous evaluation, we contribute a diverse synthetic dataset with clean annotations. On both synthetic and real datasets, our method shows consistent quantitative and qualitative improvements over existing methods.
Submitted 31 July, 2020; v1 submitted 7 July, 2020; originally announced July 2020.
Comments: ECCV 2020 Oral. Dataset & Code: https://github.com/ZheC/GTA-IM-Dataset Video: https://people.eecs.berkeley.edu/~zhecao/hmp/index.html

arXiv:2005.13532 [pdf, other] cs.CV cs.GR
4D Visualization of Dynamic Events from Unconstrained Multi-View Videos
Authors: Aayush Bansal, Minh Vo, Yaser Sheikh, Deva Ramanan, Srinivasa Narasimhan
Abstract: We present a data-driven approach for 4D space-time visualization of dynamic events from videos captured by hand-held multiple cameras. Key to our approach is the use of self-supervised neural networks specific to the scene to compose static and dynamic aspects of an event. Though captured from discrete viewpoints, this model enables us to move around the space-time of the event continuously. The model allows us to create virtual cameras that facilitate: (1) freezing the time and exploring views; (2) freezing a view and moving through time; and (3) simultaneously changing both time and view. We can also edit the videos and reveal occluded objects for a given view if they are visible in any of the other views. We validate our approach on challenging in-the-wild events captured using up to 15 mobile cameras.
Submitted 27 May, 2020; originally announced May 2020.
Comments: Project Page - http://www.cs.cmu.edu/~aayushb/Open4D/
arXiv:1911.08079 [pdf, other] cs.CV, doi:10.1007/s00138-020-01086-1
Two-Stream FCNs to Balance Content and Style for Style Transfer
Authors: Duc Minh Vo, Akihiro Sugimoto
Abstract: Style transfer renders given image contents in given styles, and it plays an important role in both fundamental computer vision research and industrial applications. Following the success of deep learning based approaches, this problem has recently attracted renewed attention, but it remains difficult because of the trade-off between preserving content and faithfully rendering style. Indeed, how well content and style are balanced is crucial in evaluating the quality of stylized images. In this paper, we propose an end-to-end two-stream Fully Convolutional Network (FCN) architecture aiming at balancing the contributions of the content and the style in rendered images. Our proposed network consists of encoder and decoder parts. The encoder part uses one FCN for content and one FCN for style, where the two FCNs have feature injections and are independently trained to preserve the semantic content and to learn the faithful style representation, respectively. The semantic content feature and the style representation feature are then adaptively concatenated and fed into the decoder to generate style-transferred (stylized) images. To train our proposed network, we employ a loss network, the pre-trained VGG-16, to compute a content loss and a style loss, both of which are efficiently used for the feature injection as well as the feature concatenation. Our intensive experiments show that our proposed model generates stylized images that are better balanced between content and style than those of state-of-the-art methods. Moreover, our proposed network is also efficient in terms of speed.
Submitted 7 May, 2020; v1 submitted 18 November, 2019; originally announced November 2019.
Comments: published in Machine Vision and Applications
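The "loss network" supervision mentioned in this abstract is the standard perceptual setup: a frozen pre-trained VGG-16 provides a content loss from feature maps and a style loss from Gram matrices. Below is a sketch with torchvision; the specific layer indices and the recent-torchvision weights API are assumptions, and the paper's exact layer choices and loss weighting may differ.

    # Standard VGG-based content and Gram-matrix style losses (layer choices assumed).
    import torch
    import torch.nn.functional as F
    import torchvision

    vgg = torchvision.models.vgg16(
        weights=torchvision.models.VGG16_Weights.DEFAULT).features.eval()
    for p in vgg.parameters():
        p.requires_grad_(False)

    CONTENT_LAYER = 15             # relu3_3 in torchvision's vgg16.features indexing
    STYLE_LAYERS = {3, 8, 15, 22}  # relu1_2, relu2_2, relu3_3, relu4_3

    def vgg_features(x):
        feats, out = {}, x
        for i, layer in enumerate(vgg):
            out = layer(out)
            if i == CONTENT_LAYER or i in STYLE_LAYERS:
                feats[i] = out
        return feats

    def gram(f):
        # (B, C, H, W) -> (B, C, C), normalized Gram matrix
        b, c, h, w = f.shape
        f = f.reshape(b, c, h * w)
        return f @ f.transpose(1, 2) / (c * h * w)

    def perceptual_losses(stylized, content_img, style_img):
        fs, fc, fy = vgg_features(stylized), vgg_features(content_img), vgg_features(style_img)
        content_loss = F.mse_loss(fs[CONTENT_LAYER], fc[CONTENT_LAYER])
        style_loss = sum(F.mse_loss(gram(fs[i]), gram(fy[i])) for i in STYLE_LAYERS)
        return content_loss, style_loss

How these two losses are weighted against each other is exactly the content/style balance the abstract is concerned with.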
arXiv:1908.01741 [pdf, other] cs.CV
Visual-Relation Conscious Image Generation from Structured-Text
Authors: Duc Minh Vo, Akihiro Sugimoto
Abstract: We propose an end-to-end network for image generation from given structured text that consists of a visual-relation layout module and a pyramid of GANs, namely stacking-GANs. Our visual-relation layout module uses relations among entities in the structured text in two ways: comprehensive usage and individual usage. We comprehensively use all available relations together to localize initial bounding boxes of all the entities. We also use each individual relation separately to predict, from the initial bounding boxes, relation units for all the relations in the input text. We then unify all the relation units to produce the visual-relation layout, i.e., bounding boxes for all the entities, so that each of them uniquely corresponds to one entity while keeping its involved relations. Our visual-relation layout reflects the scene structure given in the input text. The stacking-GANs are a stack of three GANs conditioned on the visual-relation layout and the output of the previous GAN, consistently capturing the scene structure. Our network realistically renders entities' details in high resolution while keeping the scene structure. Experimental results on two public datasets show that our method outperforms state-of-the-art methods.
Submitted 18 July, 2020; v1 submitted 5 August, 2019; originally announced August 2019.
Comments: accepted at ECCV 2020

arXiv:1805.08717 [pdf, other] cs.CV, doi:10.1109/TPAMI.2020.2974726
Self-supervised Multi-view Person Association and Its Applications
Authors: Minh Vo, Ersin Yumer, Kalyan Sunkavalli, Sunil Hadap, Yaser Sheikh, Srinivasa Narasimhan
Abstract: Reliable markerless motion tracking of people participating in a complex group activity from multiple moving cameras is challenging due to frequent occlusions, strong viewpoint and appearance variations, and asynchronous video streams. To solve this problem, reliable association of the same person across distant viewpoints and temporal instances is essential. We present a self-supervised framework to adapt a generic person appearance descriptor to the unlabeled videos by exploiting motion tracking, mutual exclusion constraints, and multi-view geometry. The adapted discriminative descriptor is used in a tracking-by-clustering formulation. We validate the effectiveness of our descriptor learning on WILDTRACK [14] and three new complex social scenes captured by multiple cameras with up to 60 people "in the wild". We report a significant improvement in association accuracy (up to 18%) and stable and coherent 3D human skeleton tracking (5 to 10 times) over the baseline. Using the reconstructed 3D skeletons, we cut the input videos into a multi-angle video where the image of a specified person is shown from the best visible front-facing camera. Our algorithm detects inter-human occlusion to determine the camera-switching moment while still maintaining the flow of the action.
Submitted 18 April, 2020; v1 submitted 22 May, 2018; originally announced May 2018.
Comments: Accepted to IEEE TPAMI
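One way to picture the self-supervision signal in this last abstract: embeddings of detections linked by motion tracking or multi-view geometry are pulled together, while detections that co-occur in the same frame (and therefore cannot be the same person, the mutual-exclusion constraint) are pushed apart. The triplet-style loss below is only a rough stand-in for the paper's descriptor-adaptation objective, with hypothetical names and a margin chosen for illustration.

    # Rough sketch of self-supervised descriptor adaptation (illustrative only).
    import torch
    import torch.nn.functional as F

    def adaptation_triplet_loss(anchor, positive, negatives, margin=0.3):
        """anchor, positive: (D,) embeddings of the same person (same track, or
        geometrically consistent across views); negatives: (K, D) embeddings of
        other people detected in the same frame as the anchor."""
        a, p = F.normalize(anchor, dim=0), F.normalize(positive, dim=0)
        n = F.normalize(negatives, dim=1)
        d_ap = 1.0 - torch.dot(a, p)        # cosine distance to the positive
        d_an = 1.0 - (n @ a)                # cosine distances to the negatives
        return F.relu(d_ap - d_an + margin).mean()

The adapted embeddings would then drive the tracking-by-clustering stage described above.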
