Search | arXiv e-print repository
<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/> <!-- new favicon config and versions by realfavicongenerator.net --> <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b"> <!-- end favicon config --> <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\\(","\\)"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a> <!-- contains Cornell logo and sponsor statement --> <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. 
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 64 results for author: <span class="mathjax">Poggi, M</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Poggi%2C+M">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Poggi, M"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Poggi%2C+M&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Poggi, M"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Poggi%2C+M&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Poggi%2C+M&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Poggi%2C+M&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14053">arXiv:2411.14053</a> <span> [<a href="https://arxiv.org/pdf/2411.14053">pdf</a>, <a href="https://arxiv.org/format/2411.14053">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Stereo Anything: Unifying Stereo Matching with Large-Scale Mixed Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guo%2C+X">Xianda Guo</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chenming Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Youmin Zhang</a>, <a href="/search/cs?searchtype=author&query=Nie%2C+D">Dujun Nie</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+R">Ruilin Wang</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+W">Wenzhao Zheng</a>, <a href="/search/cs?searchtype=author&query=Poggi%2C+M">Matteo Poggi</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Long Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14053v1-abstract-short" style="display: inline;"> Stereo matching has been a pivotal component in 3D vision, aiming to find corresponding points between pairs of stereo images to recover depth information. In this work, we introduce StereoAnything, a highly practical solution for robust stereo matching. 
Rather than focusing on a specialized model, our goal is to develop a versatile foundational model capable of handling stereo images across diver… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14053v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14053v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14053v1-abstract-full" style="display: none;"> Stereo matching has been a pivotal component in 3D vision, aiming to find corresponding points between pairs of stereo images to recover depth information. In this work, we introduce StereoAnything, a highly practical solution for robust stereo matching. Rather than focusing on a specialized model, our goal is to develop a versatile foundational model capable of handling stereo images across diverse environments. To this end, we scale up the dataset by collecting labeled stereo images and generating synthetic stereo pairs from unlabeled monocular images. To further enrich the model's ability to generalize across different conditions, we introduce a novel synthetic dataset that complements existing data by adding variability in baselines, camera angles, and scene types. We extensively evaluate the zero-shot capabilities of our model on five public datasets, showcasing its impressive ability to generalize to new, unseen data. Code will be available at \url{https://github.com/XiandaGuo/OpenStereo}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14053v1-abstract-full').style.display = 'none'; document.getElementById('2411.14053v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Code will be available at \url{https://github.com/XiandaGuo/OpenStereo}</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.08277">arXiv:2409.08277</a> <span> [<a href="https://arxiv.org/pdf/2409.08277">pdf</a>, <a href="https://arxiv.org/format/2409.08277">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Depth on Demand: Streaming Dense Depth from a Low Frame Rate Active Sensor </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Conti%2C+A">Andrea Conti</a>, <a href="/search/cs?searchtype=author&query=Poggi%2C+M">Matteo Poggi</a>, <a href="/search/cs?searchtype=author&query=Cambareri%2C+V">Valerio Cambareri</a>, <a href="/search/cs?searchtype=author&query=Mattoccia%2C+S">Stefano Mattoccia</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.08277v1-abstract-short" style="display: inline;"> High frame rate and accurate depth estimation plays an important role in several tasks crucial to robotics and automotive perception. 
To date, this can be achieved through ToF and LiDAR devices for indoor and outdoor applications, respectively. However, their applicability is limited by low frame rate, energy consumption, and spatial sparsity. Depth on Demand (DoD) allows for accurate temporal and… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.08277v1-abstract-full').style.display = 'inline'; document.getElementById('2409.08277v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.08277v1-abstract-full" style="display: none;"> High frame rate and accurate depth estimation plays an important role in several tasks crucial to robotics and automotive perception. To date, this can be achieved through ToF and LiDAR devices for indoor and outdoor applications, respectively. However, their applicability is limited by low frame rate, energy consumption, and spatial sparsity. Depth on Demand (DoD) allows for accurate temporal and spatial depth densification achieved by exploiting a high frame rate RGB sensor coupled with a potentially lower frame rate and sparse active depth sensor. Our proposal jointly enables lower energy consumption and denser shape reconstruction, by significantly reducing the streaming requirements on the depth sensor thanks to its three core stages: i) multi-modal encoding, ii) iterative multi-modal integration, and iii) depth decoding. We present extended evidence assessing the effectiveness of DoD on indoor and outdoor video datasets, covering both environment scanning and automotive perception use cases. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.08277v1-abstract-full').style.display = 'none'; document.getElementById('2409.08277v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted for publication at the European Conference on Computer Vision (ECCV) 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.07456">arXiv:2409.07456</a> <span> [<a href="https://arxiv.org/pdf/2409.07456">pdf</a>, <a href="https://arxiv.org/format/2409.07456">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Self-Evolving Depth-Supervised 3D Gaussian Splatting from Rendered Stereo Pairs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Safadoust%2C+S">Sadra Safadoust</a>, <a href="/search/cs?searchtype=author&query=Tosi%2C+F">Fabio Tosi</a>, <a href="/search/cs?searchtype=author&query=G%C3%BCney%2C+F">Fatma G眉ney</a>, <a href="/search/cs?searchtype=author&query=Poggi%2C+M">Matteo Poggi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.07456v1-abstract-short" style="display: inline;"> 3D Gaussian Splatting (GS) significantly struggles to accurately represent the underlying 3D scene geometry, resulting in inaccuracies and floating artifacts when rendering depth maps. In this paper, we address this limitation, undertaking a comprehensive analysis of the integration of depth priors throughout the optimization process of Gaussian primitives, and present a novel strategy for this pu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.07456v1-abstract-full').style.display = 'inline'; document.getElementById('2409.07456v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.07456v1-abstract-full" style="display: none;"> 3D Gaussian Splatting (GS) significantly struggles to accurately represent the underlying 3D scene geometry, resulting in inaccuracies and floating artifacts when rendering depth maps. In this paper, we address this limitation, undertaking a comprehensive analysis of the integration of depth priors throughout the optimization process of Gaussian primitives, and present a novel strategy for this purpose. This latter dynamically exploits depth cues from a readily available stereo network, processing virtual stereo pairs rendered by the GS model itself during training and achieving consistent self-improvement of the scene representation. Experimental results on three popular datasets, breaking ground as the first to assess depth accuracy for these models, validate our findings. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.07456v1-abstract-full').style.display = 'none'; document.getElementById('2409.07456v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">BMVC 2024. Project page: https://kuis-ai.github.io/StereoGS/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.04633">arXiv:2408.04633</a> <span> [<a href="https://arxiv.org/pdf/2408.04633">pdf</a>, <a href="https://arxiv.org/format/2408.04633">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> LiDAR-Event Stereo Fusion with Hallucinations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Bartolomei%2C+L">Luca Bartolomei</a>, <a href="/search/cs?searchtype=author&query=Poggi%2C+M">Matteo Poggi</a>, <a href="/search/cs?searchtype=author&query=Conti%2C+A">Andrea Conti</a>, <a href="/search/cs?searchtype=author&query=Mattoccia%2C+S">Stefano Mattoccia</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.04633v1-abstract-short" style="display: inline;"> Event stereo matching is an emerging technique to estimate depth from neuromorphic cameras; however, events are unlikely to trigger in the absence of motion or the presence of large, untextured regions, making the correspondence problem extremely challenging. Purposely, we propose integrating a stereo event camera with a fixed-frequency active sensor -- e.g., a LiDAR -- collecting sparse depth mea… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.04633v1-abstract-full').style.display = 'inline'; document.getElementById('2408.04633v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.04633v1-abstract-full" style="display: none;"> Event stereo matching is an emerging technique to estimate depth from neuromorphic cameras; however, events are unlikely to trigger in the absence of motion or the presence of large, untextured regions, making the correspondence problem extremely challenging. Purposely, we propose integrating a stereo event camera with a fixed-frequency active sensor -- e.g., a LiDAR -- collecting sparse depth measurements, overcoming the aforementioned limitations. Such depth hints are used by hallucinating -- i.e., inserting fictitious events -- the stacks or raw input streams, compensating for the lack of information in the absence of brightness changes. Our techniques are general, can be adapted to any structured representation to stack events and outperform state-of-the-art fusion methods applied to event-based stereo. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.04633v1-abstract-full').style.display = 'none'; document.getElementById('2408.04633v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ECCV 2024. Code: https://github.com/bartn8/eventvppstereo/ - Project Page: https://eventvppstereo.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.16698">arXiv:2407.16698</a> <span> [<a href="https://arxiv.org/pdf/2407.16698">pdf</a>, <a href="https://arxiv.org/format/2407.16698">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Diffusion Models for Monocular Depth Estimation: Overcoming Challenging Conditions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tosi%2C+F">Fabio Tosi</a>, <a href="/search/cs?searchtype=author&query=Ramirez%2C+P+Z">Pierluigi Zama Ramirez</a>, <a href="/search/cs?searchtype=author&query=Poggi%2C+M">Matteo Poggi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.16698v1-abstract-short" style="display: inline;"> We present a novel approach designed to address the complexities posed by challenging, out-of-distribution data in the single-image depth estimation task. Starting with images that facilitate depth prediction due to the absence of unfavorable factors, we systematically generate new, user-defined scenes with a comprehensive set of challenges and associated depth information. This is achieved by lev… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.16698v1-abstract-full').style.display = 'inline'; document.getElementById('2407.16698v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.16698v1-abstract-full" style="display: none;"> We present a novel approach designed to address the complexities posed by challenging, out-of-distribution data in the single-image depth estimation task. Starting with images that facilitate depth prediction due to the absence of unfavorable factors, we systematically generate new, user-defined scenes with a comprehensive set of challenges and associated depth information. This is achieved by leveraging cutting-edge text-to-image diffusion models with depth-aware control, known for synthesizing high-quality image content from textual prompts while preserving the coherence of 3D structure between generated and source imagery. Subsequent fine-tuning of any monocular depth network is carried out through a self-distillation protocol that takes into account images generated using our strategy and its own depth predictions on simple, unchallenging scenes. Experiments on benchmarks tailored for our purposes demonstrate the effectiveness and versatility of our proposal. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.16698v1-abstract-full').style.display = 'none'; document.getElementById('2407.16698v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ECCV 2024. Code: https://github.com/fabiotosi92/Diffusion4RobustDepth Project page: https://diffusion4robustdepth.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.07816">arXiv:2407.07816</a> <span> [<a href="https://arxiv.org/pdf/2407.07816">pdf</a>, <a href="https://arxiv.org/format/2407.07816">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> A Survey on Deep Stereo Matching in the Twenties </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tosi%2C+F">Fabio Tosi</a>, <a href="/search/cs?searchtype=author&query=Bartolomei%2C+L">Luca Bartolomei</a>, <a href="/search/cs?searchtype=author&query=Poggi%2C+M">Matteo Poggi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.07816v1-abstract-short" style="display: inline;"> Stereo matching is close to hitting a half-century of history, yet witnessed a rapid evolution in the last decade thanks to deep learning. While previous surveys in the late 2010s covered the first stage of this revolution, the last five years of research brought further ground-breaking advancements to the field. This paper aims to fill this gap in a two-fold manner: first, we offer an in-depth ex… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.07816v1-abstract-full').style.display = 'inline'; document.getElementById('2407.07816v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.07816v1-abstract-full" style="display: none;"> Stereo matching is close to hitting a half-century of history, yet witnessed a rapid evolution in the last decade thanks to deep learning. While previous surveys in the late 2010s covered the first stage of this revolution, the last five years of research brought further ground-breaking advancements to the field. This paper aims to fill this gap in a two-fold manner: first, we offer an in-depth examination of the latest developments in deep stereo matching, focusing on the pioneering architectural designs and groundbreaking paradigms that have redefined the field in the 2020s; second, we present a thorough analysis of the critical challenges that have emerged alongside these advances, providing a comprehensive taxonomy of these issues and exploring the state-of-the-art techniques proposed to address them. 
By reviewing both the architectural innovations and the key challenges, we offer a holistic view of deep stereo matching and highlight the specific areas that require further investigation. To accompany this survey, we maintain a regularly updated project page that catalogs papers on deep stereo matching in our Awesome-Deep-Stereo-Matching (https://github.com/fabiotosi92/Awesome-Deep-Stereo-Matching) repository. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.07816v1-abstract-full').style.display = 'none'; document.getElementById('2407.07816v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Extended version of CVPR 2024 Tutorial "Deep Stereo Matching in the Twenties" (https://sites.google.com/view/stereo-twenties)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.19833">arXiv:2406.19833</a> <span> [<a href="https://arxiv.org/pdf/2406.19833">pdf</a>, <a href="https://arxiv.org/format/2406.19833">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> LightStereo: Channel Boost Is All Your Need for Efficient 2D Cost Aggregation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guo%2C+X">Xianda Guo</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chenming Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Youmin Zhang</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+W">Wenzhao Zheng</a>, <a href="/search/cs?searchtype=author&query=Nie%2C+D">Dujun Nie</a>, <a href="/search/cs?searchtype=author&query=Poggi%2C+M">Matteo Poggi</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Long Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.19833v2-abstract-short" style="display: inline;"> We present LightStereo, a cutting-edge stereo-matching network crafted to accelerate the matching process. Departing from conventional methodologies that rely on aggregating computationally intensive 4D costs, LightStereo adopts the 3D cost volume as a lightweight alternative. While similar approaches have been explored previously, our breakthrough lies in enhancing performance through a dedicated… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.19833v2-abstract-full').style.display = 'inline'; document.getElementById('2406.19833v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.19833v2-abstract-full" style="display: none;"> We present LightStereo, a cutting-edge stereo-matching network crafted to accelerate the matching process. 
Departing from conventional methodologies that rely on aggregating computationally intensive 4D costs, LightStereo adopts the 3D cost volume as a lightweight alternative. While similar approaches have been explored previously, our breakthrough lies in enhancing performance through a dedicated focus on the channel dimension of the 3D cost volume, where the distribution of matching costs is encapsulated. Our exhaustive exploration has yielded plenty of strategies to amplify the capacity of the pivotal dimension, ensuring both precision and efficiency. We compare the proposed LightStereo with existing state-of-the-art methods across various benchmarks, which demonstrate its superior performance in speed, accuracy, and resource utilization. LightStereo achieves a competitive EPE metric in the SceneFlow datasets while demanding a minimum of only 22 GFLOPs and 17 ms of runtime, and ranks 1st on KITTI 2015 among real-time models. Our comprehensive analysis reveals the effect of 2D cost aggregation for stereo matching, paving the way for real-world applications of efficient stereo systems. Code will be available at \url{https://github.com/XiandaGuo/OpenStereo}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.19833v2-abstract-full').style.display = 'none'; document.getElementById('2406.19833v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Code will be available at \url{https://github.com/XiandaGuo/OpenStereo}</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.04345">arXiv:2406.04345</a> <span> [<a href="https://arxiv.org/pdf/2406.04345">pdf</a>, <a href="https://arxiv.org/format/2406.04345">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Stereo-Depth Fusion through Virtual Pattern Projection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Bartolomei%2C+L">Luca Bartolomei</a>, <a href="/search/cs?searchtype=author&query=Poggi%2C+M">Matteo Poggi</a>, <a href="/search/cs?searchtype=author&query=Tosi%2C+F">Fabio Tosi</a>, <a href="/search/cs?searchtype=author&query=Conti%2C+A">Andrea Conti</a>, <a href="/search/cs?searchtype=author&query=Mattoccia%2C+S">Stefano Mattoccia</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.04345v1-abstract-short" style="display: inline;"> This paper presents a novel general-purpose stereo and depth data fusion paradigm that mimics the active stereo principle by replacing the unreliable physical pattern projector with a depth sensor. 
It works by projecting virtual patterns consistent with the scene geometry onto the left and right images acquired by a conventional stereo camera, using the sparse hints obtained from a depth sensor, t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.04345v1-abstract-full').style.display = 'inline'; document.getElementById('2406.04345v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.04345v1-abstract-full" style="display: none;"> This paper presents a novel general-purpose stereo and depth data fusion paradigm that mimics the active stereo principle by replacing the unreliable physical pattern projector with a depth sensor. It works by projecting virtual patterns consistent with the scene geometry onto the left and right images acquired by a conventional stereo camera, using the sparse hints obtained from a depth sensor, to facilitate the visual correspondence. Purposely, any depth sensing device can be seamlessly plugged into our framework, enabling the deployment of a virtual active stereo setup in any possible environment and overcoming the severe limitations of physical pattern projection, such as the limited working range and environmental conditions. Exhaustive experiments on indoor and outdoor datasets featuring both long and close range, including those providing raw, unfiltered depth hints from off-the-shelf depth sensors, highlight the effectiveness of our approach in notably boosting the robustness and accuracy of algorithms and deep stereo without any code modification and even without re-training. Additionally, we assess the performance of our strategy on active stereo evaluation datasets with conventional pattern projection. Indeed, in all these scenarios, our virtual pattern projection paradigm achieves state-of-the-art performance. The source code is available at: https://github.com/bartn8/vppstereo. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.04345v1-abstract-full').style.display = 'none'; document.getElementById('2406.04345v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">extended version of ICCV 2023: "Active Stereo Without Pattern Projector"</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.01493">arXiv:2406.01493</a> <span> [<a href="https://arxiv.org/pdf/2406.01493">pdf</a>, <a href="https://arxiv.org/format/2406.01493">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Learning Temporally Consistent Video Depth from Video Diffusion Priors </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shao%2C+J">Jiahao Shao</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yuanbo Yang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+H">Hongyu Zhou</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Youmin Zhang</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+Y">Yujun Shen</a>, <a href="/search/cs?searchtype=author&query=Poggi%2C+M">Matteo Poggi</a>, <a href="/search/cs?searchtype=author&query=Liao%2C+Y">Yiyi Liao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.01493v2-abstract-short" style="display: inline;"> This work addresses the challenge of video depth estimation, which expects not only per-frame accuracy but, more importantly, cross-frame consistency. Instead of directly developing a depth estimator from scratch, we reformulate the prediction task into a conditional generation problem. This allows us to leverage the prior knowledge embedded in existing video generation models, thereby reducing le… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.01493v2-abstract-full').style.display = 'inline'; document.getElementById('2406.01493v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.01493v2-abstract-full" style="display: none;"> This work addresses the challenge of video depth estimation, which expects not only per-frame accuracy but, more importantly, cross-frame consistency. Instead of directly developing a depth estimator from scratch, we reformulate the prediction task into a conditional generation problem. This allows us to leverage the prior knowledge embedded in existing video generation models, thereby reducing learning difficulty and enhancing generalizability. Concretely, we study how to tame the public Stable Video Diffusion (SVD) to predict reliable depth from input videos using a mixture of image depth and video depth datasets. We empirically confirm that a procedural training strategy -- first optimizing the spatial layers of SVD and then optimizing the temporal layers while keeping the spatial layers frozen -- yields the best results in terms of both spatial accuracy and temporal consistency. We further examine the sliding window strategy for inference on arbitrarily long videos. Our observations indicate a trade-off between efficiency and performance, with a one-frame overlap already producing favorable results. 
9. arXiv:2406.01493 [pdf, other] cs.CV
Learning Temporally Consistent Video Depth from Video Diffusion Priors
Authors: Jiahao Shao, Yuanbo Yang, Hongyu Zhou, Youmin Zhang, Yujun Shen, Matteo Poggi, Yiyi Liao
Abstract: This work addresses the challenge of video depth estimation, which expects not only per-frame accuracy but, more importantly, cross-frame consistency. Instead of directly developing a depth estimator from scratch, we reformulate the prediction task into a conditional generation problem. This allows us to leverage the prior knowledge embedded in existing video generation models, thereby reducing learning difficulty and enhancing generalizability. Concretely, we study how to tame the public Stable Video Diffusion (SVD) to predict reliable depth from input videos using a mixture of image depth and video depth datasets. We empirically confirm that a procedural training strategy -- first optimizing the spatial layers of SVD and then optimizing the temporal layers while keeping the spatial layers frozen -- yields the best results in terms of both spatial accuracy and temporal consistency. We further examine the sliding window strategy for inference on arbitrarily long videos. Our observations indicate a trade-off between efficiency and performance, with a one-frame overlap already producing favorable results. Extensive experimental results demonstrate the superiority of our approach, termed ChronoDepth, over existing alternatives, particularly in terms of the temporal consistency of the estimated depth. Additionally, we highlight the benefits of more consistent video depth in two practical applications: depth-conditioned video generation and novel view synthesis. Our project page is available at https://jhaoshao.github.io/ChronoDepth/.
Submitted 3 June, 2024; v1 submitted 3 June, 2024; originally announced June 2024.

10. arXiv:2405.14873 [pdf, other] cs.CV
Federated Online Adaptation for Deep Stereo
Authors: Matteo Poggi, Fabio Tosi
Abstract: We introduce a novel approach for adapting deep stereo networks in a collaborative manner. By building over principles of federated learning, we develop a distributed framework allowing for demanding the optimization process to a number of clients deployed in different environments. This makes it possible, for a deep stereo network running on resourced-constrained devices, to capitalize on the adaptation process carried out by other instances of the same architecture, and thus improve its accuracy in challenging environments even when it cannot carry out adaptation on its own. Experimental results show how federated adaptation performs equivalently to on-device adaptation, and even better when dealing with challenging environments.
Submitted 23 May, 2024; originally announced May 2024.
Comments: CVPR 2024. Project page: https://fedstereo.github.io/

11. arXiv:2405.10357 [pdf, other] cs.CV
RGB Guided ToF Imaging System: A Survey of Deep Learning-based Methods
Authors: Xin Qiao, Matteo Poggi, Pengchao Deng, Hao Wei, Chenyang Ge, Stefano Mattoccia
Abstract: Integrating an RGB camera into a ToF imaging system has become a significant technique for perceiving the real world. The RGB guided ToF imaging system is crucial to several applications, including face anti-spoofing, saliency detection, and trajectory prediction. Depending on the distance of the working range, the implementation schemes of the RGB guided ToF imaging systems are different. Specifically, ToF sensors with a uniform field of illumination, which can output dense depth but have low resolution, are typically used for close-range measurements. In contrast, LiDARs, which emit laser pulses and can only capture sparse depth, are usually employed for long-range detection. In the two cases, depth quality improvement for RGB guided ToF imaging corresponds to two sub-tasks: guided depth super-resolution and guided depth completion. In light of the recent significant boost to the field provided by deep learning, this paper comprehensively reviews the works related to RGB guided ToF imaging, including network structures, learning strategies, evaluation metrics, benchmark datasets, and objective functions. Besides, we present quantitative comparisons of state-of-the-art methods on widely used benchmark datasets. Finally, we discuss future trends and the challenges in real applications for further research.
Submitted 16 May, 2024; originally announced May 2024.
Comments: To appear on International Journal of Computer Vision (IJCV)

12. arXiv:2404.16831 [pdf, other] cs.CV
The Third Monocular Depth Estimation Challenge
Authors: Jaime Spencer, Fabio Tosi, Matteo Poggi, Ripudaman Singh Arora, Chris Russell, Simon Hadfield, Richard Bowden, GuangYuan Zhou, ZhengXin Li, Qiang Rao, YiPing Bao, Xiao Liu, Dohyeong Kim, Jinseong Kim, Myunghyun Kim, Mykola Lavreniuk, Rui Li, Qing Mao, Jiang Wu, Yu Zhu, Jinqiu Sun, Yanning Zhang, Suraj Patni, Aradhye Agarwal, Chetan Arora, et al. (16 additional authors not shown)
Abstract: This paper discusses the results of the third edition of the Monocular Depth Estimation Challenge (MDEC). The challenge focuses on zero-shot generalization to the challenging SYNS-Patches dataset, featuring complex scenes in natural and indoor settings. As with the previous edition, methods can use any form of supervision, i.e. supervised or self-supervised. The challenge received a total of 19 submissions outperforming the baseline on the test set: 10 among them submitted a report describing their approach, highlighting a diffused use of foundational models such as Depth Anything at the core of their method. The challenge winners drastically improved 3D F-Score performance, from 17.51% to 23.72%.
Submitted 27 April, 2024; v1 submitted 25 April, 2024; originally announced April 2024.
Comments: To appear in CVPRW2024

13. arXiv:2402.13255 [pdf, other] cs.CV cs.RO
How NeRFs and 3D Gaussian Splatting are Reshaping SLAM: a Survey
Authors: Fabio Tosi, Youmin Zhang, Ziren Gong, Erik Sandström, Stefano Mattoccia, Martin R. Oswald, Matteo Poggi
Abstract: Over the past two decades, research in the field of Simultaneous Localization and Mapping (SLAM) has undergone a significant evolution, highlighting its critical role in enabling autonomous exploration of unknown environments. This evolution ranges from hand-crafted methods, through the era of deep learning, to more recent developments focused on Neural Radiance Fields (NeRFs) and 3D Gaussian Splatting (3DGS) representations. Recognizing the growing body of research and the absence of a comprehensive survey on the topic, this paper aims to provide the first comprehensive overview of SLAM progress through the lens of the latest advancements in radiance fields. It sheds light on the background, evolutionary path, inherent strengths and limitations, and serves as a fundamental reference to highlight the dynamic progress and specific challenges.
Submitted 11 April, 2024; v1 submitted 20 February, 2024; originally announced February 2024.

14. arXiv:2401.14401 [pdf, other] cs.CV
Range-Agnostic Multi-View Depth Estimation With Keyframe Selection
Authors: Andrea Conti, Matteo Poggi, Valerio Cambareri, Stefano Mattoccia
Abstract: Methods for 3D reconstruction from posed frames require prior knowledge about the scene metric range, usually to recover matching cues along the epipolar lines and narrow the search range. However, such prior might not be directly available or estimated inaccurately in real scenarios -- e.g., outdoor 3D reconstruction from video sequences -- therefore heavily hampering performance. In this paper, we focus on multi-view depth estimation without requiring prior knowledge about the metric range of the scene by proposing RAMDepth, an efficient and purely 2D framework that reverses the depth estimation and matching steps order. Moreover, we demonstrate the capability of our framework to provide rich insights about the quality of the views used for prediction. Additional material can be found on our project page https://andreaconti.github.io/projects/range_agnostic_multi_view_depth.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.14401v1-abstract-full').style.display = 'none'; document.getElementById('2401.14401v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">3DV 2024 Project Page https://andreaconti.github.io/projects/range_agnostic_multi_view_depth GitHub Page https://github.com/andreaconti/ramdepth.git</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.09254">arXiv:2312.09254</a> <span> [<a href="https://arxiv.org/pdf/2312.09254">pdf</a>, <a href="https://arxiv.org/format/2312.09254">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Revisiting Depth Completion from a Stereo Matching Perspective for Cross-domain Generalization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Bartolomei%2C+L">Luca Bartolomei</a>, <a href="/search/cs?searchtype=author&query=Poggi%2C+M">Matteo Poggi</a>, <a href="/search/cs?searchtype=author&query=Conti%2C+A">Andrea Conti</a>, <a href="/search/cs?searchtype=author&query=Tosi%2C+F">Fabio Tosi</a>, <a href="/search/cs?searchtype=author&query=Mattoccia%2C+S">Stefano Mattoccia</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.09254v1-abstract-short" style="display: inline;"> This paper proposes a new framework for depth completion robust against domain-shifting issues. It exploits the generalization capability of modern stereo networks to face depth completion, by processing fictitious stereo pairs obtained through a virtual pattern projection paradigm. Any stereo network or traditional stereo matcher can be seamlessly plugged into our framework, allowing for the depl… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.09254v1-abstract-full').style.display = 'inline'; document.getElementById('2312.09254v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.09254v1-abstract-full" style="display: none;"> This paper proposes a new framework for depth completion robust against domain-shifting issues. It exploits the generalization capability of modern stereo networks to face depth completion, by processing fictitious stereo pairs obtained through a virtual pattern projection paradigm. Any stereo network or traditional stereo matcher can be seamlessly plugged into our framework, allowing for the deployment of a virtual stereo setup that is future-proof against advancement in the stereo field. Exhaustive experiments on cross-domain generalization support our claims. Hence, we argue that our framework can help depth completion to reach new deployment scenarios. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.09254v1-abstract-full').style.display = 'none'; document.getElementById('2312.09254v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">3DV 2024. Code: https://github.com/bartn8/vppdc - Project page: https://vppdc.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2309.16019">arXiv:2309.16019</a> <span> [<a href="https://arxiv.org/pdf/2309.16019">pdf</a>, <a href="https://arxiv.org/format/2309.16019">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> GasMono: Geometry-Aided Self-Supervised Monocular Depth Estimation for Indoor Scenes </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhao%2C+C">Chaoqiang Zhao</a>, <a href="/search/cs?searchtype=author&query=Poggi%2C+M">Matteo Poggi</a>, <a href="/search/cs?searchtype=author&query=Tosi%2C+F">Fabio Tosi</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+L">Lei Zhou</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Q">Qiyu Sun</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Y">Yang Tang</a>, <a href="/search/cs?searchtype=author&query=Mattoccia%2C+S">Stefano Mattoccia</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2309.16019v1-abstract-short" style="display: inline;"> This paper tackles the challenges of self-supervised monocular depth estimation in indoor scenes caused by large rotation between frames and low texture. We ease the learning process by obtaining coarse camera poses from monocular sequences through multi-view geometry to deal with the former. However, we found that limited by the scale ambiguity across different scenes in the training dataset, a n… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.16019v1-abstract-full').style.display = 'inline'; document.getElementById('2309.16019v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2309.16019v1-abstract-full" style="display: none;"> This paper tackles the challenges of self-supervised monocular depth estimation in indoor scenes caused by large rotation between frames and low texture. We ease the learning process by obtaining coarse camera poses from monocular sequences through multi-view geometry to deal with the former. However, we found that limited by the scale ambiguity across different scenes in the training dataset, a na茂ve introduction of geometric coarse poses cannot play a positive role in performance improvement, which is counter-intuitive. To address this problem, we propose to refine those poses during training through rotation and translation/scale optimization. 
To soften the effect of the low texture, we combine the global reasoning of vision transformers with an overfitting-aware, iterative self-distillation mechanism, providing more accurate depth guidance coming from the network itself. Experiments on NYUv2, ScanNet, 7scenes, and KITTI datasets support the effectiveness of each component in our framework, which sets a new state-of-the-art for indoor self-supervised monocular depth estimation, as well as outstanding generalization ability. Code and models are available at https://github.com/zxcqlf/GasMono <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.16019v1-abstract-full').style.display = 'none'; document.getElementById('2309.16019v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICCV 2023. Code: https://github.com/zxcqlf/GasMono</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2309.12315">arXiv:2309.12315</a> <span> [<a href="https://arxiv.org/pdf/2309.12315">pdf</a>, <a href="https://arxiv.org/format/2309.12315">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Active Stereo Without Pattern Projector </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Bartolomei%2C+L">Luca Bartolomei</a>, <a href="/search/cs?searchtype=author&query=Poggi%2C+M">Matteo Poggi</a>, <a href="/search/cs?searchtype=author&query=Tosi%2C+F">Fabio Tosi</a>, <a href="/search/cs?searchtype=author&query=Conti%2C+A">Andrea Conti</a>, <a href="/search/cs?searchtype=author&query=Mattoccia%2C+S">Stefano Mattoccia</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2309.12315v1-abstract-short" style="display: inline;"> This paper proposes a novel framework integrating the principles of active stereo in standard passive camera systems without a physical pattern projector. We virtually project a pattern over the left and right images according to the sparse measurements obtained from a depth sensor. Any such devices can be seamlessly plugged into our framework, allowing for the deployment of a virtual active stere… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.12315v1-abstract-full').style.display = 'inline'; document.getElementById('2309.12315v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2309.12315v1-abstract-full" style="display: none;"> This paper proposes a novel framework integrating the principles of active stereo in standard passive camera systems without a physical pattern projector. We virtually project a pattern over the left and right images according to the sparse measurements obtained from a depth sensor. 
Any such devices can be seamlessly plugged into our framework, allowing for the deployment of a virtual active stereo setup in any possible environment, overcoming the limitation of pattern projectors, such as limited working range or environmental conditions. Experiments on indoor/outdoor datasets, featuring both long and close-range, support the seamless effectiveness of our approach, boosting the accuracy of both stereo algorithms and deep networks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.12315v1-abstract-full').style.display = 'none'; document.getElementById('2309.12315v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICCV 2023. Code: https://github.com/bartn8/vppstereo - Project page: https://vppstereo.github.io</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2309.02436">arXiv:2309.02436</a> <span> [<a href="https://arxiv.org/pdf/2309.02436">pdf</a>, <a href="https://arxiv.org/format/2309.02436">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> GO-SLAM: Global Optimization for Consistent 3D Instant Reconstruction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Youmin Zhang</a>, <a href="/search/cs?searchtype=author&query=Tosi%2C+F">Fabio Tosi</a>, <a href="/search/cs?searchtype=author&query=Mattoccia%2C+S">Stefano Mattoccia</a>, <a href="/search/cs?searchtype=author&query=Poggi%2C+M">Matteo Poggi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2309.02436v1-abstract-short" style="display: inline;"> Neural implicit representations have recently demonstrated compelling results on dense Simultaneous Localization And Mapping (SLAM) but suffer from the accumulation of errors in camera tracking and distortion in the reconstruction. Purposely, we present GO-SLAM, a deep-learning-based dense visual SLAM framework globally optimizing poses and 3D reconstruction in real-time. Robust pose estimation is… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.02436v1-abstract-full').style.display = 'inline'; document.getElementById('2309.02436v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2309.02436v1-abstract-full" style="display: none;"> Neural implicit representations have recently demonstrated compelling results on dense Simultaneous Localization And Mapping (SLAM) but suffer from the accumulation of errors in camera tracking and distortion in the reconstruction. 
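The virtual pattern projection step described in the two abstracts above (arXiv:2309.12315 and its depth-completion companion arXiv:2312.09254) can be pictured with a short sketch. The snippet below is an illustrative approximation, not the authors' released code (see their GitHub repositories): it paints a random dot pattern onto a rectified left/right pair at the disparities implied by sparse depth measurements, assuming 3-channel images, a known focal length and baseline; the patch size and blending weight are made-up parameters.

```python
import numpy as np

def virtually_project_pattern(left, right, sparse_depth, focal, baseline,
                              patch=3, alpha=0.6, seed=0):
    """Paint a random dot pattern on a rectified stereo pair at the
    disparities implied by sparse depth points (illustrative sketch only)."""
    rng = np.random.default_rng(seed)
    left, right = left.astype(np.float32).copy(), right.astype(np.float32).copy()
    h, w = sparse_depth.shape
    r = patch // 2
    ys, xs = np.nonzero(sparse_depth > 0)
    for y, x in zip(ys, xs):
        d = focal * baseline / sparse_depth[y, x]   # depth -> disparity
        xr = int(round(x - d))                      # corresponding column in the right image
        if xr - r < 0 or xr + r >= w or x - r < 0 or x + r >= w or y - r < 0 or y + r >= h:
            continue
        dot = rng.uniform(0, 255, size=(patch, patch, left.shape[2]))
        for img, cx in ((left, x), (right, xr)):
            roi = img[y - r:y + r + 1, cx - r:cx + r + 1]
            roi[:] = (1 - alpha) * roi + alpha * dot  # blend the same dot into both views
    return left, right
```

A stereo matcher run on the augmented pair then produces dense disparity, which the depth-completion variant converts back to depth; consult the linked repositories for the actual implementation.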
arXiv:2309.02436 [cs.CV, cs.RO] https://arxiv.org/abs/2309.02436
GO-SLAM: Global Optimization for Consistent 3D Instant Reconstruction
Authors: Youmin Zhang, Fabio Tosi, Stefano Mattoccia, Matteo Poggi
Abstract: Neural implicit representations have recently demonstrated compelling results on dense Simultaneous Localization And Mapping (SLAM) but suffer from the accumulation of errors in camera tracking and distortion in the reconstruction. Purposely, we present GO-SLAM, a deep-learning-based dense visual SLAM framework globally optimizing poses and 3D reconstruction in real-time. Robust pose estimation is at its core, supported by efficient loop closing and online full bundle adjustment, which optimize per frame by utilizing the learned global geometry of the complete history of input frames. Simultaneously, we update the implicit and continuous surface representation on-the-fly to ensure global consistency of 3D reconstruction. Results on various synthetic and real-world datasets demonstrate that GO-SLAM outperforms state-of-the-art approaches at tracking robustness and reconstruction accuracy. Furthermore, GO-SLAM is versatile and can run with monocular, stereo, and RGB-D input.
Submitted 5 September, 2023; originally announced September 2023.
Comments: ICCV 2023. Code: https://github.com/youmi-zym/GO-SLAM - Project Page: https://youmi-zym.github.io/projects/GO-SLAM/

arXiv:2308.14108 [cs.CV, cs.AI, cs.LG] https://arxiv.org/abs/2308.14108
Depth self-supervision for single image novel view synthesis
Authors: Giovanni Minelli, Matteo Poggi, Samuele Salti
Abstract: In this paper, we tackle the problem of generating a novel image from an arbitrary viewpoint given a single frame as input. While existing methods operating in this setup aim at predicting the target view depth map to guide the synthesis, without explicit supervision over such a task, we jointly optimize our framework for both novel view synthesis and depth estimation to unleash the synergy between the two at its best. Specifically, a shared depth decoder is trained in a self-supervised manner to predict depth maps that are consistent across the source and target views. Our results demonstrate the effectiveness of our approach in addressing the challenges of both tasks allowing for higher-quality generated images, as well as more accurate depth for the target viewpoint.
Submitted 27 August, 2023; originally announced August 2023.

arXiv:2307.15063 [cs.CV] https://arxiv.org/abs/2307.15063
To Adapt or Not to Adapt? Real-Time Adaptation for Semantic Segmentation
Authors: Marc Botet Colomer, Pier Luigi Dovesi, Theodoros Panagiotakopoulos, Joao Frederico Carvalho, Linus Härenstam-Nielsen, Hossein Azizpour, Hedvig Kjellström, Daniel Cremers, Matteo Poggi
Abstract: The goal of Online Domain Adaptation for semantic segmentation is to handle unforeseeable domain changes that occur during deployment, like sudden weather events. However, the high computational costs associated with brute-force adaptation make this paradigm unfeasible for real-world applications. In this paper we propose HAMLET, a Hardware-Aware Modular Least Expensive Training framework for real-time domain adaptation. Our approach includes a hardware-aware back-propagation orchestration agent (HAMT) and a dedicated domain-shift detector that enables active control over when and how the model is adapted (LT). Thanks to these advancements, our approach is capable of performing semantic segmentation while simultaneously adapting at more than 29FPS on a single consumer-grade GPU. Our framework's encouraging accuracy and speed trade-off is demonstrated on OnDA and SHIFT benchmarks through experimental results.
Submitted 7 August, 2023; v1 submitted 27 July, 2023; originally announced July 2023.
Comments: ICCV 2023. The first two authors contributed equally. Project page: https://marcbotet.github.io/hamlet-web/
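As a rough illustration of the "domain-shift detector" idea mentioned in the HAMLET abstract above, the sketch below monitors a running statistic of the model's prediction entropy and raises a flag when it drifts beyond a threshold. The window size, z-score threshold, and the use of entropy as the signal are assumptions made for illustration, not details taken from the paper.

```python
from collections import deque
import torch
import torch.nn.functional as F

class ShiftDetector:
    """Flag a possible domain shift when prediction entropy drifts (illustrative sketch)."""
    def __init__(self, window=100, z_thresh=3.0):
        self.history = deque(maxlen=window)
        self.z_thresh = z_thresh

    def update(self, logits: torch.Tensor) -> bool:
        # Mean per-pixel entropy of the softmax predictions for the current frame.
        probs = F.softmax(logits, dim=1)
        entropy = -(probs * probs.clamp_min(1e-8).log()).sum(dim=1).mean().item()
        shifted = False
        if len(self.history) == self.history.maxlen:
            mean = sum(self.history) / len(self.history)
            std = (sum((e - mean) ** 2 for e in self.history) / len(self.history)) ** 0.5
            shifted = std > 0 and abs(entropy - mean) / std > self.z_thresh
        self.history.append(entropy)
        return shifted  # the caller decides whether to trigger adaptation
```

In an online pipeline, a True return would be the cue to enable (or intensify) back-propagation on the next frames; HAMLET's actual detector and orchestration logic are described in the paper and project page.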
arXiv:2307.15052 [cs.CV] https://arxiv.org/abs/2307.15052
Learning Depth Estimation for Transparent and Mirror Surfaces
Authors: Alex Costanzino, Pierluigi Zama Ramirez, Matteo Poggi, Fabio Tosi, Stefano Mattoccia, Luigi Di Stefano
Abstract: Inferring the depth of transparent or mirror (ToM) surfaces represents a hard challenge for either sensors, algorithms, or deep networks. We propose a simple pipeline for learning to estimate depth properly for such surfaces with neural networks, without requiring any ground-truth annotation. We unveil how to obtain reliable pseudo labels by in-painting ToM objects in images and processing them with a monocular depth estimation model. These labels can be used to fine-tune existing monocular or stereo networks, to let them learn how to deal with ToM surfaces. Experimental results on the Booster dataset show the dramatic improvements enabled by our remarkably simple proposal.
Submitted 27 July, 2023; originally announced July 2023.
Comments: Accepted at ICCV 2023. Project Page: https://cvlab-unibo.github.io/Depth4ToM
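The pseudo-labeling recipe summarized in the abstract above can be outlined in a few lines. The sketch below is only a schematic reading of that recipe, with hypothetical inpaint and monodepth callables standing in for an off-the-shelf inpainting model and a monocular depth network; the final merge step is an assumption for illustration, not the authors' exact pipeline (see their project page for that).

```python
import numpy as np

def tom_pseudo_labels(image, tom_mask, inpaint, monodepth):
    """Build depth pseudo labels for transparent/mirror (ToM) regions (illustrative sketch).

    image:     HxWx3 uint8 RGB frame
    tom_mask:  HxW bool mask of ToM pixels (assumed given, e.g. by segmentation)
    inpaint:   callable(image, mask) -> image with ToM objects painted over (assumed)
    monodepth: callable(image) -> HxW depth map (assumed)
    """
    # 1. Remove the ToM objects so the depth network sees an opaque surface instead.
    filled = inpaint(image, tom_mask)
    # 2. Run monocular depth on both the original and the in-painted frame.
    depth_orig = monodepth(image)
    depth_fill = monodepth(filled)
    # 3. Keep the original prediction outside ToM regions and the in-painted one
    #    inside; use the result as a pseudo label to fine-tune a mono or stereo network.
    return np.where(tom_mask, depth_fill, depth_orig)
```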
arXiv:2304.07051 [cs.CV, cs.AI] https://arxiv.org/abs/2304.07051
The Second Monocular Depth Estimation Challenge
Authors: Jaime Spencer, C. Stella Qian, Michaela Trescakova, Chris Russell, Simon Hadfield, Erich W. Graf, Wendy J. Adams, Andrew J. Schofield, James Elder, Richard Bowden, Ali Anwar, Hao Chen, Xiaozhi Chen, Kai Cheng, Yuchao Dai, Huynh Thai Hoa, Sadat Hossain, Jianmian Huang, Mohan Jing, Bo Li, Chao Li, Baojun Li, Zhiwen Liu, Stefano Mattoccia, Siegfried Mercelis, et al. (18 additional authors not shown)
Abstract: This paper discusses the results for the second edition of the Monocular Depth Estimation Challenge (MDEC). This edition was open to methods using any form of supervision, including fully-supervised, self-supervised, multi-task or proxy depth. The challenge was based around the SYNS-Patches dataset, which features a wide diversity of environments with high-quality dense ground-truth. This includes complex natural environments, e.g. forests or fields, which are greatly underrepresented in current benchmarks. The challenge received eight unique submissions that outperformed the provided SotA baseline on any of the pointcloud- or image-based metrics. The top supervised submission improved relative F-Score by 27.62%, while the top self-supervised improved it by 16.61%. Supervised submissions generally leveraged large collections of datasets to improve data diversity. Self-supervised submissions instead updated the network architecture and pretrained backbones. These results represent significant progress in the field, while highlighting avenues for future research, such as reducing interpolation artifacts at depth boundaries, improving self-supervised indoor performance and overall natural image accuracy.
Submitted 26 April, 2023; v1 submitted 14 April, 2023; originally announced April 2023.
Comments: Published at CVPRW2023

arXiv:2303.17603 [cs.CV, cs.RO] https://arxiv.org/abs/2303.17603
NeRF-Supervised Deep Stereo
Authors: Fabio Tosi, Alessio Tonioni, Daniele De Gregorio, Matteo Poggi
Abstract: We introduce a novel framework for training deep stereo networks effortlessly and without any ground-truth. By leveraging state-of-the-art neural rendering solutions, we generate stereo training data from image sequences collected with a single handheld camera. On top of them, a NeRF-supervised training procedure is carried out, from which we exploit rendered stereo triplets to compensate for occlusions and depth maps as proxy labels. This results in stereo networks capable of predicting sharp and detailed disparity maps. Experimental results show that models trained under this regime yield a 30-40% improvement over existing self-supervised methods on the challenging Middlebury dataset, filling the gap to supervised models and, most times, outperforming them at zero-shot generalization.
Submitted 30 March, 2023; originally announced March 2023.
Comments: CVPR 2023. Project page: https://nerfstereo.github.io/ - Code: https://github.com/fabiotosi92/NeRF-Supervised-Deep-Stereo
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.09307v2-abstract-full').style.display = 'none'; document.getElementById('2303.09307v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 March, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2302.02450">arXiv:2302.02450</a> <span> [<a href="https://arxiv.org/pdf/2302.02450">pdf</a>, <a href="https://arxiv.org/format/2302.02450">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Regularization and Optimization in Model-Based Clustering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sampaio%2C+R+A">Raphael Araujo Sampaio</a>, <a href="/search/cs?searchtype=author&query=Garcia%2C+J+D">Joaquim Dias Garcia</a>, <a href="/search/cs?searchtype=author&query=Poggi%2C+M">Marcus Poggi</a>, <a href="/search/cs?searchtype=author&query=Vidal%2C+T">Thibaut Vidal</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2302.02450v2-abstract-short" style="display: inline;"> Due to their conceptual simplicity, k-means algorithm variants have been extensively used for unsupervised cluster analysis. However, one main shortcoming of these algorithms is that they essentially fit a mixture of identical spherical Gaussians to data that vastly deviates from such a distribution. In comparison, general Gaussian Mixture Models (GMMs) can fit richer structures but require estima… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2302.02450v2-abstract-full').style.display = 'inline'; document.getElementById('2302.02450v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2302.02450v2-abstract-full" style="display: none;"> Due to their conceptual simplicity, k-means algorithm variants have been extensively used for unsupervised cluster analysis. However, one main shortcoming of these algorithms is that they essentially fit a mixture of identical spherical Gaussians to data that vastly deviates from such a distribution. In comparison, general Gaussian Mixture Models (GMMs) can fit richer structures but require estimating a quadratic number of parameters per cluster to represent the covariance matrices. This poses two main issues: (i) the underlying optimization problems are challenging due to their larger number of local minima, and (ii) their solutions can overfit the data. In this work, we design search strategies that circumvent both issues. We develop more effective optimization algorithms for general GMMs, and we combine these algorithms with regularization strategies that avoid overfitting. Through extensive computational analyses, we observe that optimization or regularization in isolation does not substantially improve cluster recovery. 
arXiv:2302.02450 [cs.LG] https://arxiv.org/abs/2302.02450
Regularization and Optimization in Model-Based Clustering
Authors: Raphael Araujo Sampaio, Joaquim Dias Garcia, Marcus Poggi, Thibaut Vidal
Abstract: Due to their conceptual simplicity, k-means algorithm variants have been extensively used for unsupervised cluster analysis. However, one main shortcoming of these algorithms is that they essentially fit a mixture of identical spherical Gaussians to data that vastly deviates from such a distribution. In comparison, general Gaussian Mixture Models (GMMs) can fit richer structures but require estimating a quadratic number of parameters per cluster to represent the covariance matrices. This poses two main issues: (i) the underlying optimization problems are challenging due to their larger number of local minima, and (ii) their solutions can overfit the data. In this work, we design search strategies that circumvent both issues. We develop more effective optimization algorithms for general GMMs, and we combine these algorithms with regularization strategies that avoid overfitting. Through extensive computational analyses, we observe that optimization or regularization in isolation does not substantially improve cluster recovery. However, combining these techniques permits a completely new level of performance previously unachieved by k-means algorithm variants, unraveling vastly different cluster structures. These results shed new light on the current status quo between GMM and k-means methods and suggest the more frequent use of general GMMs for data exploration. To facilitate such applications, we provide open-source code as well as Julia packages (UnsupervisedClustering.jl and RegularizedCovarianceMatrices.jl) implementing the proposed techniques.
Submitted 5 February, 2024; v1 submitted 5 February, 2023; originally announced February 2023.
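One standard way to regularize the per-cluster covariance matrices discussed in the abstract above is shrinkage toward a diagonal target; the sketch below shows that generic idea, with a fixed shrinkage weight chosen arbitrarily for illustration. It is not the specific estimator of the paper, whose implementations live in the Julia packages UnsupervisedClustering.jl and RegularizedCovarianceMatrices.jl.

```python
import numpy as np

def shrunk_covariance(X, weights=None, shrinkage=0.1, eps=1e-6):
    """Weighted empirical covariance shrunk toward its diagonal (illustrative sketch).

    X:         n x d data points assigned (softly) to one cluster
    weights:   optional n-vector of responsibilities from the E-step
    shrinkage: blend factor in [0, 1]; here a fixed, arbitrary value
    """
    n, d = X.shape
    w = np.ones(n) if weights is None else np.asarray(weights, dtype=float)
    w = w / w.sum()
    mean = w @ X
    centered = X - mean
    cov = (centered * w[:, None]).T @ centered   # weighted empirical covariance
    target = np.diag(np.diag(cov))               # diagonal target keeps scales, drops correlations
    reg = (1.0 - shrinkage) * cov + shrinkage * target
    return reg + eps * np.eye(d)                 # small jitter keeps the matrix positive definite
```

Inside an EM loop, an estimator like this would replace the raw M-step covariance update for each cluster; the paper studies which combinations of such regularizers and improved optimization strategies actually help cluster recovery.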
arXiv:2301.08245 [cs.CV] https://arxiv.org/abs/2301.08245
Booster: a Benchmark for Depth from Images of Specular and Transparent Surfaces
Authors: Pierluigi Zama Ramirez, Alex Costanzino, Fabio Tosi, Matteo Poggi, Samuele Salti, Stefano Mattoccia, Luigi Di Stefano
Abstract: Estimating depth from images nowadays yields outstanding results, both in terms of in-domain accuracy and generalization. However, we identify two main challenges that remain open in this field: dealing with non-Lambertian materials and effectively processing high-resolution images. Purposely, we propose a novel dataset that includes accurate and dense ground-truth labels at high resolution, featuring scenes containing several specular and transparent surfaces. Our acquisition pipeline leverages a novel deep space-time stereo framework, enabling easy and accurate labeling with sub-pixel precision. The dataset is composed of 606 samples collected in 85 different scenes; each sample includes both a high-resolution pair (12 Mpx) and an unbalanced stereo pair (Left: 12 Mpx, Right: 1.1 Mpx), typical of modern mobile devices that mount sensors with different resolutions. Additionally, we provide manually annotated material segmentation masks and 15K unlabeled samples. The dataset is composed of a train set and two test sets, the latter devoted to the evaluation of stereo and monocular depth estimation networks. Our experiments highlight the open challenges and future research directions in this field.
Submitted 30 January, 2024; v1 submitted 19 January, 2023; originally announced January 2023.
Comments: Extension of the paper "Open Challenges in Deep Stereo: the Booster Dataset" presented at CVPR 2022. Accepted at TPAMI

arXiv:2212.10806 [cs.CV] https://arxiv.org/abs/2212.10806
MaskingDepth: Masked Consistency Regularization for Semi-supervised Monocular Depth Estimation
Authors: Jongbeom Baek, Gyeongnyeon Kim, Seonghoon Park, Honggyu An, Matteo Poggi, Seungryong Kim
Abstract: We propose MaskingDepth, a novel semi-supervised learning framework for monocular depth estimation to mitigate the reliance on large ground-truth depth quantities. MaskingDepth is designed to enforce consistency between the strongly-augmented unlabeled data and the pseudo-labels derived from weakly-augmented unlabeled data, which enables learning depth without supervision. In this framework, a novel data augmentation is proposed to take advantage of a naive masking strategy as an augmentation, while avoiding its scale ambiguity problem between depths from weakly- and strongly-augmented branches and the risk of missing small-scale instances. To only retain high-confidence depth predictions from the weakly-augmented branch as pseudo-labels, we also present an uncertainty estimation technique, which is used to define robust consistency regularization. Experiments on KITTI and NYU-Depth-v2 datasets demonstrate the effectiveness of each component, its robustness to the use of fewer depth-annotated images, and superior performance compared to other state-of-the-art semi-supervised methods for monocular depth estimation. Furthermore, we show our method can be easily extended to the domain adaptation task. Our code is available at https://github.com/KU-CVLAB/MaskingDepth.
Submitted 23 March, 2023; v1 submitted 21 December, 2022; originally announced December 2022.
Comments: Project page: https://ku-cvlab.github.io/MaskingDepth/
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper has been accepted for publication at the IEEE/CVF Winter Conference on Applications of Computer Vision (WACV), Waikoloa, 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2211.13762">arXiv:2211.13762</a> <span> [<a href="https://arxiv.org/pdf/2211.13762">pdf</a>, <a href="https://arxiv.org/format/2211.13762">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ScanNeRF: a Scalable Benchmark for Neural Radiance Fields </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=De+Luigi%2C+L">Luca De Luigi</a>, <a href="/search/cs?searchtype=author&query=Bolognini%2C+D">Damiano Bolognini</a>, <a href="/search/cs?searchtype=author&query=Domeniconi%2C+F">Federico Domeniconi</a>, <a href="/search/cs?searchtype=author&query=De+Gregorio%2C+D">Daniele De Gregorio</a>, <a href="/search/cs?searchtype=author&query=Poggi%2C+M">Matteo Poggi</a>, <a href="/search/cs?searchtype=author&query=Di+Stefano%2C+L">Luigi Di Stefano</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2211.13762v2-abstract-short" style="display: inline;"> In this paper, we propose the first-ever real benchmark thought for evaluating Neural Radiance Fields (NeRFs) and, in general, Neural Rendering (NR) frameworks. We design and implement an effective pipeline for scanning real objects in quantity and effortlessly. Our scan station is built with less than 500$ hardware budget and can collect roughly 4000 images of a scanned object in just 5 minutes.… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.13762v2-abstract-full').style.display = 'inline'; document.getElementById('2211.13762v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2211.13762v2-abstract-full" style="display: none;"> In this paper, we propose the first-ever real benchmark thought for evaluating Neural Radiance Fields (NeRFs) and, in general, Neural Rendering (NR) frameworks. We design and implement an effective pipeline for scanning real objects in quantity and effortlessly. Our scan station is built with less than 500$ hardware budget and can collect roughly 4000 images of a scanned object in just 5 minutes. Such a platform is used to build ScanNeRF, a dataset characterized by several train/val/test splits aimed at benchmarking the performance of modern NeRF methods under different conditions. Accordingly, we evaluate three cutting-edge NeRF variants on it to highlight their strengths and weaknesses. The dataset is available on our project page, together with an online benchmark to foster the development of better and better NeRFs. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.13762v2-abstract-full').style.display = 'none'; document.getElementById('2211.13762v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 December, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 November, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">WACV 2023. The first three authors contributed equally. Project page: https://eyecan-ai.github.io/scannerf/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2211.13755">arXiv:2211.13755</a> <span> [<a href="https://arxiv.org/pdf/2211.13755">pdf</a>, <a href="https://arxiv.org/format/2211.13755">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> TemporalStereo: Efficient Spatial-Temporal Stereo Matching Network </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Youmin Zhang</a>, <a href="/search/cs?searchtype=author&query=Poggi%2C+M">Matteo Poggi</a>, <a href="/search/cs?searchtype=author&query=Mattoccia%2C+S">Stefano Mattoccia</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2211.13755v2-abstract-short" style="display: inline;"> We present TemporalStereo, a coarse-to-fine stereo matching network that is highly efficient, and able to effectively exploit the past geometry and context information to boost matching accuracy. Our network leverages sparse cost volume and proves to be effective when a single stereo pair is given. However, its peculiar ability to use spatio-temporal information across stereo sequences allows Temp… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.13755v2-abstract-full').style.display = 'inline'; document.getElementById('2211.13755v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2211.13755v2-abstract-full" style="display: none;"> We present TemporalStereo, a coarse-to-fine stereo matching network that is highly efficient, and able to effectively exploit the past geometry and context information to boost matching accuracy. Our network leverages sparse cost volume and proves to be effective when a single stereo pair is given. However, its peculiar ability to use spatio-temporal information across stereo sequences allows TemporalStereo to alleviate problems such as occlusions and reflective regions while enjoying high efficiency also in this latter case. Notably, our model -- trained once with stereo videos -- can run in both single-pair and temporal modes seamlessly. Experiments show that our network relying on camera motion is robust even to dynamic objects when running on videos. 
We validate TemporalStereo through extensive experiments on synthetic (SceneFlow, TartanAir) and real (KITTI 2012, KITTI 2015) datasets. Our model achieves state-of-the-art performance on any of these datasets. Code is available at \url{https://github.com/youmi-zym/TemporalStereo.git}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.13755v2-abstract-full').style.display = 'none'; document.getElementById('2211.13755v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 November, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by IROS 2023, Project page: https://youmi-zym.github.io/projects/TemporalStereo/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2211.12174">arXiv:2211.12174</a> <span> [<a href="https://arxiv.org/pdf/2211.12174">pdf</a>, <a href="https://arxiv.org/format/2211.12174">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> The Monocular Depth Estimation Challenge </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Spencer%2C+J">Jaime Spencer</a>, <a href="/search/cs?searchtype=author&query=Qian%2C+C+S">C. Stella Qian</a>, <a href="/search/cs?searchtype=author&query=Russell%2C+C">Chris Russell</a>, <a href="/search/cs?searchtype=author&query=Hadfield%2C+S">Simon Hadfield</a>, <a href="/search/cs?searchtype=author&query=Graf%2C+E">Erich Graf</a>, <a href="/search/cs?searchtype=author&query=Adams%2C+W">Wendy Adams</a>, <a href="/search/cs?searchtype=author&query=Schofield%2C+A+J">Andrew J. Schofield</a>, <a href="/search/cs?searchtype=author&query=Elder%2C+J">James Elder</a>, <a href="/search/cs?searchtype=author&query=Bowden%2C+R">Richard Bowden</a>, <a href="/search/cs?searchtype=author&query=Cong%2C+H">Heng Cong</a>, <a href="/search/cs?searchtype=author&query=Mattoccia%2C+S">Stefano Mattoccia</a>, <a href="/search/cs?searchtype=author&query=Poggi%2C+M">Matteo Poggi</a>, <a href="/search/cs?searchtype=author&query=Suri%2C+Z+K">Zeeshan Khan Suri</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Y">Yang Tang</a>, <a href="/search/cs?searchtype=author&query=Tosi%2C+F">Fabio Tosi</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hao Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Youmin Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yusheng Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+C">Chaoqiang Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2211.12174v1-abstract-short" style="display: inline;"> This paper summarizes the results of the first Monocular Depth Estimation Challenge (MDEC) organized at WACV2023. 
This challenge evaluated the progress of self-supervised monocular depth estimation on the challenging SYNS-Patches dataset. The challenge was organized on CodaLab and received submissions from 4 valid teams. Participants were provided a devkit containing updated reference implementati… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.12174v1-abstract-full').style.display = 'inline'; document.getElementById('2211.12174v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2211.12174v1-abstract-full" style="display: none;"> This paper summarizes the results of the first Monocular Depth Estimation Challenge (MDEC) organized at WACV2023. This challenge evaluated the progress of self-supervised monocular depth estimation on the challenging SYNS-Patches dataset. The challenge was organized on CodaLab and received submissions from 4 valid teams. Participants were provided a devkit containing updated reference implementations for 16 State-of-the-Art algorithms and 4 novel techniques. The threshold for acceptance for novel techniques was to outperform every one of the 16 SotA baselines. All participants outperformed the baseline in traditional metrics such as MAE or AbsRel. However, pointcloud reconstruction metrics were challenging to improve upon. We found predictions were characterized by interpolation artefacts at object boundaries and errors in relative object positioning. We hope this challenge is a valuable contribution to the community and encourage authors to participate in future editions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.12174v1-abstract-full').style.display = 'none'; document.getElementById('2211.12174v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 November, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">WACV-Workshops 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2210.11467">arXiv:2210.11467</a> <span> [<a href="https://arxiv.org/pdf/2210.11467">pdf</a>, <a href="https://arxiv.org/format/2210.11467">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Multi-View Guided Multi-View Stereo </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Poggi%2C+M">Matteo Poggi</a>, <a href="/search/cs?searchtype=author&query=Conti%2C+A">Andrea Conti</a>, <a href="/search/cs?searchtype=author&query=Mattoccia%2C+S">Stefano Mattoccia</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2210.11467v1-abstract-short" style="display: inline;"> This paper introduces a novel deep framework for dense 3D reconstruction from multiple image frames, leveraging a sparse set of depth measurements gathered jointly with image acquisition. 
Given a deep multi-view stereo network, our framework uses sparse depth hints to guide the neural network by modulating the plane-sweep cost volume built during the forward step, enabling us to infer constantly m… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.11467v1-abstract-full').style.display = 'inline'; document.getElementById('2210.11467v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2210.11467v1-abstract-full" style="display: none;"> This paper introduces a novel deep framework for dense 3D reconstruction from multiple image frames, leveraging a sparse set of depth measurements gathered jointly with image acquisition. Given a deep multi-view stereo network, our framework uses sparse depth hints to guide the neural network by modulating the plane-sweep cost volume built during the forward step, enabling us to infer constantly much more accurate depth maps. Moreover, since multiple viewpoints can provide additional depth measurements, we propose a multi-view guidance strategy that increases the density of the sparse points used to guide the network, thus leading to even more accurate results. We evaluate our Multi-View Guided framework within a variety of state-of-the-art deep multi-view stereo networks, demonstrating its effectiveness at improving the results achieved by each of them on BlendedMVG and DTU datasets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.11467v1-abstract-full').style.display = 'none'; document.getElementById('2210.11467v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">IROS 2022. First two authors contributed equally. Project page: https://github.com/andreaconti/multi-view-guided-multi-view-stereo</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2210.03118">arXiv:2210.03118</a> <span> [<a href="https://arxiv.org/pdf/2210.03118">pdf</a>, <a href="https://arxiv.org/format/2210.03118">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Unsupervised confidence for LiDAR depth maps and applications </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Conti%2C+A">Andrea Conti</a>, <a href="/search/cs?searchtype=author&query=Poggi%2C+M">Matteo Poggi</a>, <a href="/search/cs?searchtype=author&query=Aleotti%2C+F">Filippo Aleotti</a>, <a href="/search/cs?searchtype=author&query=Mattoccia%2C+S">Stefano Mattoccia</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2210.03118v1-abstract-short" style="display: inline;"> Depth perception is pivotal in many fields, such as robotics and autonomous driving, to name a few. Consequently, depth sensors such as LiDARs rapidly spread in many applications. 
The 3D point clouds generated by these sensors must often be coupled with an RGB camera to understand the framed scene semantically. Usually, the former is projected over the camera image plane, leading to a sparse depth… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.03118v1-abstract-full').style.display = 'inline'; document.getElementById('2210.03118v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2210.03118v1-abstract-full" style="display: none;"> Depth perception is pivotal in many fields, such as robotics and autonomous driving, to name a few. Consequently, depth sensors such as LiDARs rapidly spread in many applications. The 3D point clouds generated by these sensors must often be coupled with an RGB camera to understand the framed scene semantically. Usually, the former is projected over the camera image plane, leading to a sparse depth map. Unfortunately, this process, coupled with the intrinsic issues affecting all the depth sensors, yields noise and gross outliers in the final output. Purposely, in this paper, we propose an effective unsupervised framework aimed at explicitly addressing this issue by learning to estimate the confidence of the LiDAR sparse depth map and thus allowing for filtering out the outliers. Experimental results on the KITTI dataset highlight that our framework excels for this purpose. Moreover, we demonstrate how this achievement can improve a wide range of tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.03118v1-abstract-full').style.display = 'none'; document.getElementById('2210.03118v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">IROS 2022. 
Code available at https://github.com/andreaconti/lidar-confidence</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2209.00648">arXiv:2209.00648</a> <span> [<a href="https://arxiv.org/pdf/2209.00648">pdf</a>, <a href="https://arxiv.org/format/2209.00648">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Cross-Spectral Neural Radiance Fields </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Poggi%2C+M">Matteo Poggi</a>, <a href="/search/cs?searchtype=author&query=Ramirez%2C+P+Z">Pierluigi Zama Ramirez</a>, <a href="/search/cs?searchtype=author&query=Tosi%2C+F">Fabio Tosi</a>, <a href="/search/cs?searchtype=author&query=Salti%2C+S">Samuele Salti</a>, <a href="/search/cs?searchtype=author&query=Mattoccia%2C+S">Stefano Mattoccia</a>, <a href="/search/cs?searchtype=author&query=Di+Stefano%2C+L">Luigi Di Stefano</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2209.00648v1-abstract-short" style="display: inline;"> We propose X-NeRF, a novel method to learn a Cross-Spectral scene representation given images captured from cameras with different light spectrum sensitivity, based on the Neural Radiance Fields formulation. X-NeRF optimizes camera poses across spectra during training and exploits Normalized Cross-Device Coordinates (NXDC) to render images of different modalities from arbitrary viewpoints, which a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2209.00648v1-abstract-full').style.display = 'inline'; document.getElementById('2209.00648v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2209.00648v1-abstract-full" style="display: none;"> We propose X-NeRF, a novel method to learn a Cross-Spectral scene representation given images captured from cameras with different light spectrum sensitivity, based on the Neural Radiance Fields formulation. X-NeRF optimizes camera poses across spectra during training and exploits Normalized Cross-Device Coordinates (NXDC) to render images of different modalities from arbitrary viewpoints, which are aligned and at the same resolution. Experiments on 16 forward-facing scenes, featuring color, multi-spectral and infrared images, confirm the effectiveness of X-NeRF at modeling Cross-Spectral scene representations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2209.00648v1-abstract-full').style.display = 'none'; document.getElementById('2209.00648v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 September, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">3DV 2022. 
Project page: https://cvlab-unibo.github.io/xnerf-web/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2208.03543">arXiv:2208.03543</a> <span> [<a href="https://arxiv.org/pdf/2208.03543">pdf</a>, <a href="https://arxiv.org/format/2208.03543">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/3DV57658.2022.00077">10.1109/3DV57658.2022.00077 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> MonoViT: Self-Supervised Monocular Depth Estimation with a Vision Transformer </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhao%2C+C">Chaoqiang Zhao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Youmin Zhang</a>, <a href="/search/cs?searchtype=author&query=Poggi%2C+M">Matteo Poggi</a>, <a href="/search/cs?searchtype=author&query=Tosi%2C+F">Fabio Tosi</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+X">Xianda Guo</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Z">Zheng Zhu</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+G">Guan Huang</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Y">Yang Tang</a>, <a href="/search/cs?searchtype=author&query=Mattoccia%2C+S">Stefano Mattoccia</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2208.03543v1-abstract-short" style="display: inline;"> Self-supervised monocular depth estimation is an attractive solution that does not require hard-to-source depth labels for training. Convolutional neural networks (CNNs) have recently achieved great success in this task. However, their limited receptive field constrains existing network architectures to reason only locally, dampening the effectiveness of the self-supervised paradigm. In the light… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2208.03543v1-abstract-full').style.display = 'inline'; document.getElementById('2208.03543v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2208.03543v1-abstract-full" style="display: none;"> Self-supervised monocular depth estimation is an attractive solution that does not require hard-to-source depth labels for training. Convolutional neural networks (CNNs) have recently achieved great success in this task. However, their limited receptive field constrains existing network architectures to reason only locally, dampening the effectiveness of the self-supervised paradigm. In the light of the recent successes achieved by Vision Transformers (ViTs), we propose MonoViT, a brand-new framework combining the global reasoning enabled by ViT models with the flexibility of self-supervised monocular depth estimation. 
By combining plain convolutions with Transformer blocks, our model can reason locally and globally, yielding depth prediction at a higher level of detail and accuracy, allowing MonoViT to achieve state-of-the-art performance on the established KITTI dataset. Moreover, MonoViT proves its superior generalization capacities on other datasets such as Make3D and DrivingStereo. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2208.03543v1-abstract-full').style.display = 'none'; document.getElementById('2208.03543v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 August, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by 3DV 2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2207.10667">arXiv:2207.10667</a> <span> [<a href="https://arxiv.org/pdf/2207.10667">pdf</a>, <a href="https://arxiv.org/format/2207.10667">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Online Domain Adaptation for Semantic Segmentation in Ever-Changing Conditions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Panagiotakopoulos%2C+T">Theodoros Panagiotakopoulos</a>, <a href="/search/cs?searchtype=author&query=Dovesi%2C+P+L">Pier Luigi Dovesi</a>, <a href="/search/cs?searchtype=author&query=H%C3%A4renstam-Nielsen%2C+L">Linus Härenstam-Nielsen</a>, <a href="/search/cs?searchtype=author&query=Poggi%2C+M">Matteo Poggi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2207.10667v1-abstract-short" style="display: inline;"> Unsupervised Domain Adaptation (UDA) aims at reducing the domain gap between training and testing data and is, in most cases, carried out in an offline manner. However, domain changes may occur continuously and unpredictably during deployment (e.g. sudden weather changes). In such conditions, deep neural networks witness dramatic drops in accuracy and offline adaptation may not be enough to counteract… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2207.10667v1-abstract-full').style.display = 'inline'; document.getElementById('2207.10667v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2207.10667v1-abstract-full" style="display: none;"> Unsupervised Domain Adaptation (UDA) aims at reducing the domain gap between training and testing data and is, in most cases, carried out in an offline manner. However, domain changes may occur continuously and unpredictably during deployment (e.g. sudden weather changes). In such conditions, deep neural networks witness dramatic drops in accuracy and offline adaptation may not be enough to counteract it. In this paper, we tackle Online Domain Adaptation (OnDA) for semantic segmentation.
We design a pipeline that is robust to continuous domain shifts, either gradual or sudden, and we evaluate it in the case of rainy and foggy scenarios. Our experiments show that our framework can effectively adapt to new domains during deployment, while not being affected by catastrophic forgetting of the previous domains. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2207.10667v1-abstract-full').style.display = 'none'; document.getElementById('2207.10667v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 July, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ECCV 2022. Project page: https://theo2021.github.io/onda-web/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2206.07047">arXiv:2206.07047</a> <span> [<a href="https://arxiv.org/pdf/2206.07047">pdf</a>, <a href="https://arxiv.org/format/2206.07047">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> RGB-Multispectral Matching: Dataset, Learning Methodology, Evaluation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tosi%2C+F">Fabio Tosi</a>, <a href="/search/cs?searchtype=author&query=Ramirez%2C+P+Z">Pierluigi Zama Ramirez</a>, <a href="/search/cs?searchtype=author&query=Poggi%2C+M">Matteo Poggi</a>, <a href="/search/cs?searchtype=author&query=Salti%2C+S">Samuele Salti</a>, <a href="/search/cs?searchtype=author&query=Mattoccia%2C+S">Stefano Mattoccia</a>, <a href="/search/cs?searchtype=author&query=Di+Stefano%2C+L">Luigi Di Stefano</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2206.07047v1-abstract-short" style="display: inline;"> We address the problem of registering synchronized color (RGB) and multi-spectral (MS) images featuring very different resolution by solving stereo matching correspondences. Purposely, we introduce a novel RGB-MS dataset framing 13 different scenes in indoor environments and providing a total of 34 image pairs annotated with semi-dense, high-resolution ground-truth labels in the form of disparity… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2206.07047v1-abstract-full').style.display = 'inline'; document.getElementById('2206.07047v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2206.07047v1-abstract-full" style="display: none;"> We address the problem of registering synchronized color (RGB) and multi-spectral (MS) images featuring very different resolution by solving stereo matching correspondences. Purposely, we introduce a novel RGB-MS dataset framing 13 different scenes in indoor environments and providing a total of 34 image pairs annotated with semi-dense, high-resolution ground-truth labels in the form of disparity maps. 
To tackle the task, we propose a deep learning architecture trained in a self-supervised manner by exploiting a further RGB camera, required only during training data acquisition. In this setup, we can conveniently learn cross-modal matching in the absence of ground-truth labels by distilling knowledge from an easier RGB-RGB matching task based on a collection of about 11K unlabeled image triplets. Experiments show that the proposed pipeline sets a good performance bar (1.16 pixels average registration error) for future research on this novel, challenging task. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2206.07047v1-abstract-full').style.display = 'none'; document.getElementById('2206.07047v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 June, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">CVPR 2022, New Orleans. Project page: https://cvlab-unibo.github.io/rgb-ms-web/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2206.04671">arXiv:2206.04671</a> <span> [<a href="https://arxiv.org/pdf/2206.04671">pdf</a>, <a href="https://arxiv.org/format/2206.04671">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Open Challenges in Deep Stereo: the Booster Dataset </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ramirez%2C+P+Z">Pierluigi Zama Ramirez</a>, <a href="/search/cs?searchtype=author&query=Tosi%2C+F">Fabio Tosi</a>, <a href="/search/cs?searchtype=author&query=Poggi%2C+M">Matteo Poggi</a>, <a href="/search/cs?searchtype=author&query=Salti%2C+S">Samuele Salti</a>, <a href="/search/cs?searchtype=author&query=Mattoccia%2C+S">Stefano Mattoccia</a>, <a href="/search/cs?searchtype=author&query=Di+Stefano%2C+L">Luigi Di Stefano</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2206.04671v1-abstract-short" style="display: inline;"> We present a novel high-resolution and challenging stereo dataset framing indoor scenes annotated with dense and accurate ground-truth disparities. Peculiar to our dataset is the presence of several specular and transparent surfaces, i.e. the main causes of failures for state-of-the-art stereo networks. Our acquisition pipeline leverages a novel deep space-time stereo framework which allows for ea… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2206.04671v1-abstract-full').style.display = 'inline'; document.getElementById('2206.04671v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2206.04671v1-abstract-full" style="display: none;"> We present a novel high-resolution and challenging stereo dataset framing indoor scenes annotated with dense and accurate ground-truth disparities. 
Peculiar to our dataset is the presence of several specular and transparent surfaces, i.e. the main causes of failures for state-of-the-art stereo networks. Our acquisition pipeline leverages a novel deep space-time stereo framework which allows for easy and accurate labeling with sub-pixel precision. We release a total of 419 samples collected in 64 different scenes and annotated with dense ground-truth disparities. Each sample includes a high-resolution pair (12 Mpx) as well as an unbalanced pair (Left: 12 Mpx, Right: 1.1 Mpx). Additionally, we provide manually annotated material segmentation masks and 15K unlabeled samples. We evaluate state-of-the-art deep networks based on our dataset, highlighting their limitations in addressing the open challenges in stereo and drawing hints for future research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2206.04671v1-abstract-full').style.display = 'none'; document.getElementById('2206.04671v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 June, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">CVPR 2022, New Orleans. Project page: https://cvlab-unibo.github.io/booster-web/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2206.02714">arXiv:2206.02714</a> <span> [<a href="https://arxiv.org/pdf/2206.02714">pdf</a>, <a href="https://arxiv.org/format/2206.02714">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> FuSS: Fusing Superpixels for Improved Segmentation Consistency </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Nunes%2C+I">Ian Nunes</a>, <a href="/search/cs?searchtype=author&query=Pereira%2C+M+B">Matheus B. Pereira</a>, <a href="/search/cs?searchtype=author&query=Oliveira%2C+H">Hugo Oliveira</a>, <a href="/search/cs?searchtype=author&query=Santos%2C+J+A+D">Jefersson A. Dos Santos</a>, <a href="/search/cs?searchtype=author&query=Poggi%2C+M">Marcus Poggi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2206.02714v1-abstract-short" style="display: inline;"> In this work, we propose two different approaches to improve the semantic consistency of Open Set Semantic Segmentation. First, we propose a method called OpenGMM that extends the OpenPCS framework using a Gaussian Mixture of Models to model the distribution of pixels for each class in a multimodal manner.
The second approach is a post-processing step which uses superpixels to enforce highly homogeneou… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2206.02714v1-abstract-full').style.display = 'inline'; document.getElementById('2206.02714v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2206.02714v1-abstract-full" style="display: none;"> In this work, we propose two different approaches to improve the semantic consistency of Open Set Semantic Segmentation. First, we propose a method called OpenGMM that extends the OpenPCS framework using a Gaussian Mixture of Models to model the distribution of pixels for each class in a multimodal manner. The second approach is a post-processing step which uses superpixels to enforce highly homogeneous regions to behave equally, rectifying erroneously classified pixels within these regions; we also propose a novel superpixel method called FuSS. All tests were performed on the ISPRS Vaihingen and Potsdam datasets, and both methods were capable of improving quantitative and qualitative results for both datasets. Moreover, the post-processing with FuSS achieved state-of-the-art results for both datasets. The official implementation is available at: \url{https://github.com/iannunes/FuSS}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2206.02714v1-abstract-full').style.display = 'none'; document.getElementById('2206.02714v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 June, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">submitted to IEEE Access. 19 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2204.01693">arXiv:2204.01693</a> <span> [<a href="https://arxiv.org/pdf/2204.01693">pdf</a>, <a href="https://arxiv.org/format/2204.01693">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Monitoring social distancing with single image depth estimation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mingozzi%2C+A">Alessio Mingozzi</a>, <a href="/search/cs?searchtype=author&query=Conti%2C+A">Andrea Conti</a>, <a href="/search/cs?searchtype=author&query=Aleotti%2C+F">Filippo Aleotti</a>, <a href="/search/cs?searchtype=author&query=Poggi%2C+M">Matteo Poggi</a>, <a href="/search/cs?searchtype=author&query=Mattoccia%2C+S">Stefano Mattoccia</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2204.01693v2-abstract-short" style="display: inline;"> The recent pandemic emergency raised many challenges regarding the countermeasures aimed at containing the virus spread, and constraining the minimum distance between people resulted in one of the most effective strategies.
Thus, the implementation of autonomous systems capable of monitoring the so-called social distance gained much interest. In this paper, we aim to address this task leveraging a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2204.01693v2-abstract-full').style.display = 'inline'; document.getElementById('2204.01693v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2204.01693v2-abstract-full" style="display: none;"> The recent pandemic emergency raised many challenges regarding the countermeasures aimed at containing the virus spread, and constraining the minimum distance between people resulted in one of the most effective strategies. Thus, the implementation of autonomous systems capable of monitoring the so-called social distance gained much interest. In this paper, we aim to address this task leveraging a single RGB frame without additional depth sensors. In contrast to existing single-image alternatives failing when ground localization is not available, we rely on single image depth estimation to perceive the 3D structure of the observed scene and estimate the distance between people. During the setup phase, a straightforward calibration procedure, leveraging a scale-aware SLAM algorithm available even on consumer smartphones, allows us to address the scale ambiguity affecting single image depth estimation. We validate our approach through indoor and outdoor images employing a calibrated LiDAR + RGB camera asset. Experimental results highlight that our proposal enables sufficiently reliable estimation of the inter-personal distance to monitor social distancing effectively. This fact confirms that despite its intrinsic ambiguity, if appropriately driven single image depth estimation can be a viable alternative to other depth perception techniques, more expensive and not always feasible in practical applications. Our evaluation also highlights that our framework can run reasonably fast and comparably to competitors, even on pure CPU systems. Moreover, its practical deployment on low-power systems is around the corner. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2204.01693v2-abstract-full').style.display = 'none'; document.getElementById('2204.01693v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 April, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 April, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2022. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted for publication in IEEE Transactions on Emerging Topics in Computational Intelligence (TETCI)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2203.01368">arXiv:2203.01368</a> <span> [<a href="https://arxiv.org/pdf/2203.01368">pdf</a>, <a href="https://arxiv.org/format/2203.01368">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Conditional Reconstruction for Open-set Semantic Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Nunes%2C+I">Ian Nunes</a>, <a href="/search/cs?searchtype=author&query=Pereira%2C+M+B">Matheus B. Pereira</a>, <a href="/search/cs?searchtype=author&query=Oliveira%2C+H">Hugo Oliveira</a>, <a href="/search/cs?searchtype=author&query=Santos%2C+J+A+d">Jefersson A. dos Santos</a>, <a href="/search/cs?searchtype=author&query=Poggi%2C+M">Marcus Poggi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2203.01368v1-abstract-short" style="display: inline;"> Open set segmentation is a relatively new and unexplored task, with just a handful of methods proposed to model such tasks. We propose a novel method called CoReSeg that tackles the issue using class conditional reconstruction of the input images according to their pixelwise mask. Our method conditions each input pixel to all known classes, expecting higher errors for pixels of unknown classes. It was obs… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.01368v1-abstract-full').style.display = 'inline'; document.getElementById('2203.01368v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2203.01368v1-abstract-full" style="display: none;"> Open set segmentation is a relatively new and unexplored task, with just a handful of methods proposed to model such tasks. We propose a novel method called CoReSeg that tackles the issue using class conditional reconstruction of the input images according to their pixelwise mask. Our method conditions each input pixel to all known classes, expecting higher errors for pixels of unknown classes. It was observed that the proposed method produces better semantic consistency in its predictions, resulting in cleaner segmentation maps that better fit object boundaries. CoReSeg outperforms state-of-the-art methods on the Vaihingen and Potsdam ISPRS datasets, while also being competitive on the Houston 2018 IEEE GRSS Data Fusion dataset. Official implementation for CoReSeg is available at: https://github.com/iannunes/CoReSeg.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.01368v1-abstract-full').style.display = 'none'; document.getElementById('2203.01368v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 March, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2110.15367">arXiv:2110.15367</a> <span> [<a href="https://arxiv.org/pdf/2110.15367">pdf</a>, <a href="https://arxiv.org/format/2110.15367">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Neural Disparity Refinement for Arbitrary Resolution Stereo </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Aleotti%2C+F">Filippo Aleotti</a>, <a href="/search/cs?searchtype=author&query=Tosi%2C+F">Fabio Tosi</a>, <a href="/search/cs?searchtype=author&query=Ramirez%2C+P+Z">Pierluigi Zama Ramirez</a>, <a href="/search/cs?searchtype=author&query=Poggi%2C+M">Matteo Poggi</a>, <a href="/search/cs?searchtype=author&query=Salti%2C+S">Samuele Salti</a>, <a href="/search/cs?searchtype=author&query=Mattoccia%2C+S">Stefano Mattoccia</a>, <a href="/search/cs?searchtype=author&query=Di+Stefano%2C+L">Luigi Di Stefano</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2110.15367v1-abstract-short" style="display: inline;"> We introduce a novel architecture for neural disparity refinement aimed at facilitating deployment of 3D computer vision on cheap and widespread consumer devices, such as mobile phones. Our approach relies on a continuous formulation that enables to estimate a refined disparity map at any arbitrary output resolution. Thereby, it can handle effectively the unbalanced camera setup typical of nowaday… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.15367v1-abstract-full').style.display = 'inline'; document.getElementById('2110.15367v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2110.15367v1-abstract-full" style="display: none;"> We introduce a novel architecture for neural disparity refinement aimed at facilitating deployment of 3D computer vision on cheap and widespread consumer devices, such as mobile phones. Our approach relies on a continuous formulation that enables to estimate a refined disparity map at any arbitrary output resolution. Thereby, it can handle effectively the unbalanced camera setup typical of nowadays mobile phones, which feature both high and low resolution RGB sensors within the same device. Moreover, our neural network can process seamlessly the output of a variety of stereo methods and, by refining the disparity maps computed by a traditional matching algorithm like SGM, it can achieve unpaired zero-shot generalization performance compared to state-of-the-art end-to-end stereo models. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.15367v1-abstract-full').style.display = 'none'; document.getElementById('2110.15367v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">3DV 2021 Oral paper. Project page: https://cvlab-unibo.github.io/neural-disparity-refinement-web</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2109.15321">arXiv:2109.15321</a> <span> [<a href="https://arxiv.org/pdf/2109.15321">pdf</a>, <a href="https://arxiv.org/format/2109.15321">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Sensor-Guided Optical Flow </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Poggi%2C+M">Matteo Poggi</a>, <a href="/search/cs?searchtype=author&query=Aleotti%2C+F">Filippo Aleotti</a>, <a href="/search/cs?searchtype=author&query=Mattoccia%2C+S">Stefano Mattoccia</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2109.15321v1-abstract-short" style="display: inline;"> This paper proposes a framework to guide an optical flow network with external cues to achieve superior accuracy either on known or unseen domains. Given the availability of sparse yet accurate optical flow hints from an external source, these are injected to modulate the correlation scores computed by a state-of-the-art optical flow network and guide it towards more accurate predictions. Although… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2109.15321v1-abstract-full').style.display = 'inline'; document.getElementById('2109.15321v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2109.15321v1-abstract-full" style="display: none;"> This paper proposes a framework to guide an optical flow network with external cues to achieve superior accuracy either on known or unseen domains. Given the availability of sparse yet accurate optical flow hints from an external source, these are injected to modulate the correlation scores computed by a state-of-the-art optical flow network and guide it towards more accurate predictions. Although no real sensor can provide sparse flow hints, we show how these can be obtained by combining depth measurements from active sensors with geometry and hand-crafted optical flow algorithms, leading to accurate enough hints for our purpose. Experimental results with a state-of-the-art flow network on standard benchmarks support the effectiveness of our framework, both in simulated and real conditions. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2109.15321v1-abstract-full').style.display = 'none'; document.getElementById('2109.15321v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 September, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICCV 2021</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2104.03965">arXiv:2104.03965</a> <span> [<a href="https://arxiv.org/pdf/2104.03965">pdf</a>, <a href="https://arxiv.org/format/2104.03965">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Learning optical flow from still images </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Aleotti%2C+F">Filippo Aleotti</a>, <a href="/search/cs?searchtype=author&query=Poggi%2C+M">Matteo Poggi</a>, <a href="/search/cs?searchtype=author&query=Mattoccia%2C+S">Stefano Mattoccia</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2104.03965v1-abstract-short" style="display: inline;"> This paper deals with the scarcity of data for training optical flow networks, highlighting the limitations of existing sources such as labeled synthetic datasets or unlabeled real videos. Specifically, we introduce a framework to generate accurate ground-truth optical flow annotations quickly and in large amounts from any readily available single real picture. Given an image, we use an off-the-sh… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2104.03965v1-abstract-full').style.display = 'inline'; document.getElementById('2104.03965v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2104.03965v1-abstract-full" style="display: none;"> This paper deals with the scarcity of data for training optical flow networks, highlighting the limitations of existing sources such as labeled synthetic datasets or unlabeled real videos. Specifically, we introduce a framework to generate accurate ground-truth optical flow annotations quickly and in large amounts from any readily available single real picture. Given an image, we use an off-the-shelf monocular depth estimation network to build a plausible point cloud for the observed scene. Then, we virtually move the camera in the reconstructed environment with known motion vectors and rotation angles, allowing us to synthesize both a novel view and the corresponding optical flow field connecting each pixel in the input image to the one in the new frame. When trained with our data, state-of-the-art optical flow networks achieve superior generalization to unseen real data compared to the same models trained either on annotated synthetic datasets or unlabeled videos, and better specialization if combined with synthetic images. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2104.03965v1-abstract-full').style.display = 'none'; document.getElementById('2104.03965v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 April, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">CVPR 2021. Project page with supplementary and code: https://mattpoggi.github.io/projects/cvpr2021aleotti/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2101.00431">arXiv:2101.00431</a> <span> [<a href="https://arxiv.org/pdf/2101.00431">pdf</a>, <a href="https://arxiv.org/format/2101.00431">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> On the confidence of stereo matching in a deep-learning era: a quantitative evaluation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Poggi%2C+M">Matteo Poggi</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+S">Seungryong Kim</a>, <a href="/search/cs?searchtype=author&query=Tosi%2C+F">Fabio Tosi</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+S">Sunok Kim</a>, <a href="/search/cs?searchtype=author&query=Aleotti%2C+F">Filippo Aleotti</a>, <a href="/search/cs?searchtype=author&query=Min%2C+D">Dongbo Min</a>, <a href="/search/cs?searchtype=author&query=Sohn%2C+K">Kwanghoon Sohn</a>, <a href="/search/cs?searchtype=author&query=Mattoccia%2C+S">Stefano Mattoccia</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2101.00431v3-abstract-short" style="display: inline;"> Stereo matching is one of the most popular techniques to estimate dense depth maps by finding the disparity between matching pixels on two, synchronized and rectified images. Alongside with the development of more accurate algorithms, the research community focused on finding good strategies to estimate the reliability, i.e. the confidence, of estimated disparity maps. This information proves to b… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2101.00431v3-abstract-full').style.display = 'inline'; document.getElementById('2101.00431v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2101.00431v3-abstract-full" style="display: none;"> Stereo matching is one of the most popular techniques to estimate dense depth maps by finding the disparity between matching pixels on two, synchronized and rectified images. Alongside with the development of more accurate algorithms, the research community focused on finding good strategies to estimate the reliability, i.e. the confidence, of estimated disparity maps. This information proves to be a powerful cue to naively find wrong matches as well as to improve the overall effectiveness of a variety of stereo algorithms according to different strategies. 
arXiv:2010.07347 (https://arxiv.org/abs/2010.07347) [pdf, other] - cs.CV
Matching-space Stereo Networks for Cross-domain Generalization
Authors: Changjiang Cai, Matteo Poggi, Stefano Mattoccia, Philippos Mordohai
Abstract: End-to-end deep networks represent the state of the art for stereo matching. While excelling on images framing environments similar to the training set, major drops in accuracy occur in unseen domains (e.g., when moving from synthetic to real scenes). In this paper we introduce a novel family of architectures, namely Matching-Space Networks (MS-Nets), with improved generalization properties. By replacing learning-based feature extraction from image RGB values with matching functions and confidence measures from conventional wisdom, we move the learning process from the color space to the Matching Space, avoiding over-specialization to domain-specific features. Extensive experimental results on four real datasets highlight that our proposal leads to superior generalization to unseen environments over conventional deep architectures, keeping accuracy on the source domain almost unaltered. Our code is available at https://github.com/ccj5351/MS-Nets.
Submitted 14 October, 2020; originally announced October 2020.
Comments: 14 pages, 8 figures, International Conference on 3D Vision (3DV'2020), Github code at https://github.com/ccj5351/MS-Nets
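The shift from color space to matching space can be illustrated by replacing RGB inputs with traditional matching costs plus a classic confidence cue; the toy NumPy sketch below builds such an input tensor. The absolute-difference cost and peak-ratio confidence are stand-ins for the richer matching functions and measures the actual MS-Nets use.

```python
# Toy "matching space" input: per-pixel matching costs over candidate disparities plus
# a peak-ratio confidence channel, instead of raw RGB values.
import numpy as np

def matching_space_input(left, right, max_disp=16):
    H, W = left.shape
    costs = np.full((max_disp, H, W), np.inf)
    for d in range(max_disp):
        costs[d, :, d:] = np.abs(left[:, d:] - right[:, : W - d])
    costs[np.isinf(costs)] = np.nanmax(costs[np.isfinite(costs)])
    srt = np.sort(costs, axis=0)
    peak_ratio = (srt[1] + 1e-6) / (srt[0] + 1e-6)        # >1 when the minimum is distinct
    # Stack matching costs and confidence as the network input tensor.
    return np.concatenate([costs, peak_ratio[None]], axis=0)   # (max_disp + 1, H, W)

feats = matching_space_input(np.random.rand(32, 48), np.random.rand(32, 48))
```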
arXiv:2008.07130 (https://arxiv.org/abs/2008.07130) [pdf, other] - cs.CV
Reversing the cycle: self-supervised deep stereo through enhanced monocular distillation
Authors: Filippo Aleotti, Fabio Tosi, Li Zhang, Matteo Poggi, Stefano Mattoccia
Abstract: In many fields, self-supervised learning solutions are rapidly evolving and filling the gap with supervised approaches. This fact occurs for depth estimation based on either monocular or stereo, with the latter often providing a valid source of self-supervision for the former.
In contrast, to soften typical stereo artefacts, we propose a novel self-supervised paradigm reversing the link between the two. Purposely, in order to train deep stereo networks, we distill knowledge through a monocular completion network. This architecture exploits single-image clues and a few sparse points, sourced by traditional stereo algorithms, to estimate dense yet accurate disparity maps by means of a consensus mechanism over multiple estimations. We thoroughly evaluate with popular stereo datasets the impact of different supervisory signals, showing how stereo networks trained with our paradigm outperform existing self-supervised frameworks. Finally, our proposal achieves notable generalization capabilities when dealing with domain shift issues. Code available at https://github.com/FilippoAleotti/Reversing
Submitted 17 August, 2020; originally announced August 2020.
Comments: ECCV 2020
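The consensus mechanism over multiple estimations can be sketched as follows: an assumed monocular completion network is queried several times with different random subsets of the sparse points, and only pixels where the predictions agree are kept as proxy labels for the stereo network. The interface of `completion_net`, the pass count, and the agreement threshold are illustrative placeholders, not the paper's settings.

```python
# Hedged sketch of a consensus filter over multiple disparity estimates.
import torch

def consensus_labels(completion_net, image, sparse_disp, sparse_mask,
                     passes=5, keep_ratio=0.7, agree_thr=1.0):
    """Distill proxy disparity labels by consensus over several completion passes.

    completion_net(image, sparse_disp, mask) -> dense disparity   (assumed interface)
    sparse_disp / sparse_mask: sparse points from a traditional stereo algorithm.
    """
    estimates = []
    for _ in range(passes):
        # Randomly drop part of the sparse hints so each pass sees different evidence.
        drop = torch.rand_like(sparse_mask.float()) < keep_ratio
        mask = sparse_mask & drop
        estimates.append(completion_net(image, sparse_disp * mask, mask))
    est = torch.stack(estimates, dim=0)                  # (passes, *disp_shape)
    proxy = est.mean(dim=0)
    spread = est.max(dim=0).values - est.min(dim=0).values
    valid = spread < agree_thr                           # keep pixels where passes agree
    return proxy, valid                                  # proxy labels + validity mask
```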
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ECCV 2020</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2008.06447">arXiv:2008.06447</a> <span> [<a href="https://arxiv.org/pdf/2008.06447">pdf</a>, <a href="https://arxiv.org/format/2008.06447">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Self-adapting confidence estimation for stereo </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Poggi%2C+M">Matteo Poggi</a>, <a href="/search/cs?searchtype=author&query=Aleotti%2C+F">Filippo Aleotti</a>, <a href="/search/cs?searchtype=author&query=Tosi%2C+F">Fabio Tosi</a>, <a href="/search/cs?searchtype=author&query=Zaccaroni%2C+G">Giulio Zaccaroni</a>, <a href="/search/cs?searchtype=author&query=Mattoccia%2C+S">Stefano Mattoccia</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2008.06447v2-abstract-short" style="display: inline;"> Estimating the confidence of disparity maps inferred by a stereo algorithm has become a very relevant task in the years, due to the increasing number of applications leveraging such cue. Although self-supervised learning has recently spread across many computer vision tasks, it has been barely considered in the field of confidence estimation. In this paper, we propose a flexible and lightweight so… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2008.06447v2-abstract-full').style.display = 'inline'; document.getElementById('2008.06447v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2008.06447v2-abstract-full" style="display: none;"> Estimating the confidence of disparity maps inferred by a stereo algorithm has become a very relevant task in the years, due to the increasing number of applications leveraging such cue. Although self-supervised learning has recently spread across many computer vision tasks, it has been barely considered in the field of confidence estimation. In this paper, we propose a flexible and lightweight solution enabling self-adapting confidence estimation agnostic to the stereo algorithm or network. Our approach relies on the minimum information available in any stereo setup (i.e., the input stereo pair and the output disparity map) to learn an effective confidence measure. This strategy allows us not only a seamless integration with any stereo system, including consumer and industrial devices equipped with undisclosed stereo perception methods, but also, due to its self-adapting capability, for its out-of-the-box deployment in the field. Exhaustive experimental results with different standard datasets support our claims, showing how our solution is the first-ever enabling online learning of accurate confidence estimation for any stereo system and without any requirement for the end-user. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2008.06447v2-abstract-full').style.display = 'none'; document.getElementById('2008.06447v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 November, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 August, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ECCV 2020 (errata corrige: eq.6, k domain)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2007.05233">arXiv:2007.05233</a> <span> [<a href="https://arxiv.org/pdf/2007.05233">pdf</a>, <a href="https://arxiv.org/format/2007.05233">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Continual Adaptation for Deep Stereo </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Poggi%2C+M">Matteo Poggi</a>, <a href="/search/cs?searchtype=author&query=Tonioni%2C+A">Alessio Tonioni</a>, <a href="/search/cs?searchtype=author&query=Tosi%2C+F">Fabio Tosi</a>, <a href="/search/cs?searchtype=author&query=Mattoccia%2C+S">Stefano Mattoccia</a>, <a href="/search/cs?searchtype=author&query=Di+Stefano%2C+L">Luigi Di Stefano</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2007.05233v3-abstract-short" style="display: inline;"> Depth estimation from stereo images is carried out with unmatched results by convolutional neural networks trained end-to-end to regress dense disparities. Like for most tasks, this is possible if large amounts of labelled samples are available for training, possibly covering the whole data distribution encountered at deployment time. Being such an assumption systematically unmet in real applicati… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2007.05233v3-abstract-full').style.display = 'inline'; document.getElementById('2007.05233v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2007.05233v3-abstract-full" style="display: none;"> Depth estimation from stereo images is carried out with unmatched results by convolutional neural networks trained end-to-end to regress dense disparities. Like for most tasks, this is possible if large amounts of labelled samples are available for training, possibly covering the whole data distribution encountered at deployment time. Being such an assumption systematically unmet in real applications, the capacity of adapting to any unseen setting becomes of paramount importance. 
Purposely, we propose a continual adaptation paradigm for deep stereo networks designed to deal with challenging and ever-changing environments. We design a lightweight and modular architecture, Modularly ADaptive Network (MADNet), and formulate Modular ADaptation algorithms (MAD, MAD++) which permit efficient optimization of independent sub-portions of the entire network. In our paradigm, the learning signals needed to continuously adapt models online can be sourced from self-supervision via right-to-left image warping or from traditional stereo algorithms. With both sources, no data other than the input images gathered at deployment time are needed. Thus, our network architecture and adaptation algorithms realize the first real-time self-adaptive deep stereo system and pave the way for a new paradigm that can facilitate practical deployment of end-to-end architectures for dense disparity regression.
Submitted 3 May, 2021; v1 submitted 10 July, 2020; originally announced July 2020.
Comments: Extended version of CVPR 2019 paper "Real-time self-adaptive deep stereo" - Accepted to TPAMI
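The two ingredients named in the abstract, self-supervision from right-to-left image warping and adaptation of independent sub-portions of the network, can be sketched in a few lines of PyTorch. The module grouping, optimizer choice, and hyper-parameters below are assumptions for illustration, not MADNet/MAD as released.

```python
# Hedged sketch of online self-adaptation: a photometric loss from warping the right
# image into the left view with the predicted disparity, and an update applied to only
# one sub-portion of the network per step.
import random
import torch
import torch.nn.functional as F

def warp_right_to_left(right, disp):
    # right: (B, 3, H, W), disp: (B, 1, H, W) positive left-image disparities.
    B, _, H, W = right.shape
    ys, xs = torch.meshgrid(torch.arange(H), torch.arange(W), indexing="ij")
    xs = xs[None].float().to(right.device) - disp[:, 0]           # shift by disparity
    ys = ys[None].float().to(right.device).expand_as(xs)
    grid = torch.stack([2 * xs / (W - 1) - 1, 2 * ys / (H - 1) - 1], dim=-1)
    return F.grid_sample(right, grid, align_corners=True)

def online_adaptation_step(model, blocks, left, right, lr=1e-4):
    # `model` runs the full forward pass; `blocks` lists independently adaptable
    # sub-portions of it. Only the randomly chosen block is updated this step.
    block = random.choice(blocks)
    opt = torch.optim.Adam(block.parameters(), lr=lr)             # fresh optimizer: sketch only
    disp = model(left, right)                                     # assumed (B, 1, H, W) output
    loss = (warp_right_to_left(right, disp) - left).abs().mean()  # photometric L1
    model.zero_grad(set_to_none=True)
    loss.backward()
    opt.step()
    return disp.detach(), loss.item()
```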
arXiv:2006.05724 (https://arxiv.org/abs/2006.05724) [pdf, other] - cs.CV, cs.GR
Real-time single image depth perception in the wild with handheld devices
Authors: Filippo Aleotti, Giulio Zaccaroni, Luca Bartolomei, Matteo Poggi, Fabio Tosi, Stefano Mattoccia
Abstract: Depth perception is paramount to tackle real-world problems, ranging from autonomous driving to consumer applications. For the latter, depth estimation from a single image represents the most versatile solution, since a standard camera is available on almost any handheld device. Nonetheless, two main issues limit its practical deployment: i) the low reliability when deployed in the wild and ii) the demanding resource requirements to achieve real-time performance, often not compatible with such devices. Therefore, in this paper we investigate these issues in depth, showing how they can both be addressed by adopting appropriate network design and training strategies, and outlining how to map the resulting networks onto handheld devices to achieve real-time performance. Our thorough evaluation highlights the ability of such fast networks to generalize well to new environments, a crucial feature required to tackle the extremely varied contexts faced in real applications. Indeed, to further support this evidence, we report experimental results concerning real-time depth-aware augmented reality and image blurring with smartphones in the wild.
Submitted 10 June, 2020; originally announced June 2020.
Comments: 11 pages, 9 figures
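As a toy illustration of the depth-aware image blurring application mentioned at the end of the abstract, the sketch below blends each pixel with a blurred copy according to its distance from a chosen focus depth. It is deliberately simplistic and unrelated to the authors' mobile implementation.

```python
# Toy depth-aware blur: pixels far from the focus depth get a blurred version of the
# image, approximating a shallow depth-of-field effect.
import numpy as np

def box_blur(img, k=7):
    pad = k // 2
    padded = np.pad(img, ((pad, pad), (pad, pad), (0, 0)), mode="edge")
    out = np.zeros_like(img, dtype=np.float64)
    for dy in range(k):
        for dx in range(k):
            out += padded[dy:dy + img.shape[0], dx:dx + img.shape[1]]
    return out / (k * k)

def depth_aware_blur(img, depth, focus, falloff=1.0):
    blurred = box_blur(img.astype(np.float64))
    # Blend weight grows with the distance from the focus plane.
    w = np.clip(np.abs(depth - focus) / falloff, 0.0, 1.0)[..., None]
    return (1 - w) * img + w * blurred

out = depth_aware_blur(np.random.rand(64, 64, 3), np.random.rand(64, 64) * 5, focus=2.5)
```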
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, 9 figures</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Poggi%2C+M&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Poggi%2C+M&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Poggi%2C+M&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 
9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>