Search | arXiv e-print repository

<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 57 results for author: <span class="mathjax">Gilitschenski, I</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Gilitschenski%2C+I">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Gilitschenski, I"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Gilitschenski%2C+I&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Gilitschenski, I"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Gilitschenski%2C+I&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Gilitschenski%2C+I&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Gilitschenski%2C+I&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07555">arXiv:2411.07555</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.07555">pdf</a>, <a href="https://arxiv.org/format/2411.07555">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> GaussianCut: Interactive segmentation via graph cut for 3D Gaussian Splatting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Jain%2C+U">Umangi Jain</a>, <a href="/search/cs?searchtype=author&amp;query=Mirzaei%2C+A">Ashkan Mirzaei</a>, <a href="/search/cs?searchtype=author&amp;query=Gilitschenski%2C+I">Igor Gilitschenski</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.07555v1-abstract-short" style="display: inline;"> We introduce GaussianCut, a new method for interactive multiview segmentation of scenes represented as 3D Gaussians. Our approach allows for selecting the objects to be segmented by interacting with a single view. It accepts intuitive user input, such as point clicks, coarse scribbles, or text. Using 3D Gaussian Splatting (3DGS) as the underlying scene representation simplifies the extraction of o&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07555v1-abstract-full').style.display = 'inline'; document.getElementById('2411.07555v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.07555v1-abstract-full" style="display: none;"> We introduce GaussianCut, a new method for interactive multiview segmentation of scenes represented as 3D Gaussians. Our approach allows for selecting the objects to be segmented by interacting with a single view. It accepts intuitive user input, such as point clicks, coarse scribbles, or text. 
Using 3D Gaussian Splatting (3DGS) as the underlying scene representation simplifies the extraction of objects of interest which are considered to be a subset of the scene&#39;s Gaussians. Our key idea is to represent the scene as a graph and use the graph-cut algorithm to minimize an energy function to effectively partition the Gaussians into foreground and background. To achieve this, we construct a graph based on scene Gaussians and devise a segmentation-aligned energy function on the graph to combine user inputs with scene properties. To obtain an initial coarse segmentation, we leverage 2D image/video segmentation models and further refine these coarse estimates using our graph construction. Our empirical evaluations show the adaptability of GaussianCut across a diverse set of scenes. GaussianCut achieves competitive performance with state-of-the-art approaches for 3D segmentation without requiring any additional segmentation-aware training. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07555v1-abstract-full').style.display = 'none'; document.getElementById('2411.07555v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.04989">arXiv:2411.04989</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.04989">pdf</a>, <a href="https://arxiv.org/format/2411.04989">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> SG-I2V: Self-Guided Trajectory Control in Image-to-Video Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Namekata%2C+K">Koichi Namekata</a>, <a href="/search/cs?searchtype=author&amp;query=Bahmani%2C+S">Sherwin Bahmani</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Z">Ziyi Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Kant%2C+Y">Yash Kant</a>, <a href="/search/cs?searchtype=author&amp;query=Gilitschenski%2C+I">Igor Gilitschenski</a>, <a href="/search/cs?searchtype=author&amp;query=Lindell%2C+D+B">David B. Lindell</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.04989v1-abstract-short" style="display: inline;"> Methods for image-to-video generation have achieved impressive, photo-realistic quality. However, adjusting specific elements in generated videos, such as object motion or camera movement, is often a tedious process of trial and error, e.g., involving re-generating videos with different random seeds. 
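
A minimal, generic sketch of the s-t graph-cut idea referenced in the GaussianCut abstract above: unary preferences become terminal-edge capacities, pairwise affinities become edges between nodes, and a minimum cut yields the foreground/background split. The node names, capacities, and use of networkx are illustrative assumptions, not the authors' implementation.

```python
# Illustrative only: a toy s-t min-cut partition, not the GaussianCut pipeline.
import networkx as nx

G = nx.DiGraph()
nodes = ["g0", "g1", "g2", "g3"]                          # stand-ins for scene Gaussians
unary_fg = {"g0": 0.9, "g1": 0.8, "g2": 0.2, "g3": 0.1}   # e.g. derived from user clicks / 2D masks

for n in nodes:
    # Terminal edges encode each node's preference for foreground vs. background.
    G.add_edge("src", n, capacity=unary_fg[n])
    G.add_edge(n, "sink", capacity=1.0 - unary_fg[n])

# Pairwise edges encourage similar/nearby nodes to receive the same label.
for a, b, w in [("g0", "g1", 0.5), ("g1", "g2", 0.3), ("g2", "g3", 0.5)]:
    G.add_edge(a, b, capacity=w)
    G.add_edge(b, a, capacity=w)

cut_value, (reachable, _) = nx.minimum_cut(G, "src", "sink")
foreground = sorted(reachable - {"src"})                  # source side = foreground label
print(cut_value, foreground)
```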

2. arXiv:2411.04989 [pdf, other]  cs.CV, cs.LG
   SG-I2V: Self-Guided Trajectory Control in Image-to-Video Generation
   Authors: Koichi Namekata, Sherwin Bahmani, Ziyi Wu, Yash Kant, Igor Gilitschenski, David B. Lindell
   Abstract: Methods for image-to-video generation have achieved impressive, photo-realistic quality. However, adjusting specific elements in generated videos, such as object motion or camera movement, is often a tedious process of trial and error, e.g., involving re-generating videos with different random seeds. Recent techniques address this issue by fine-tuning a pre-trained model to follow conditioning signals, such as bounding boxes or point trajectories. Yet, this fine-tuning procedure can be computationally expensive, and it requires datasets with annotated object motion, which can be difficult to procure. In this work, we introduce SG-I2V, a framework for controllable image-to-video generation that is self-guided -- offering zero-shot control by relying solely on the knowledge present in a pre-trained image-to-video diffusion model without the need for fine-tuning or external knowledge. Our zero-shot method outperforms unsupervised baselines while being competitive with supervised models in terms of visual quality and motion fidelity.
   Submitted 7 November, 2024; originally announced November 2024.
   Comments: Project page: https://kmcode1.github.io/Projects/SG-I2V/

3. arXiv:2410.08896 [pdf, other]  cs.LG
   MAD-TD: Model-Augmented Data stabilizes High Update Ratio RL
   Authors: Claas A Voelcker, Marcel Hussing, Eric Eaton, Amir-massoud Farahmand, Igor Gilitschenski
   Abstract: Building deep reinforcement learning (RL) agents that find a good policy with few samples has proven notoriously challenging. To achieve sample efficiency, recent work has explored updating neural networks with large numbers of gradient steps for every new sample. While such high update-to-data (UTD) ratios have shown strong empirical performance, they also introduce instability to the training process. Previous approaches need to rely on periodic neural network parameter resets to address this instability, but restarting the training process is infeasible in many real-world applications and requires tuning the resetting interval. In this paper, we focus on one of the core difficulties of stable training with limited samples: the inability of learned value functions to generalize to unobserved on-policy actions. We mitigate this issue directly by augmenting the off-policy RL training process with a small amount of data generated from a learned world model. Our method, Model-Augmented Data for Temporal Difference learning (MAD-TD) uses small amounts of generated data to stabilize high UTD training and achieve competitive performance on the most challenging tasks in the DeepMind control suite. Our experiments further highlight the importance of employing a good model to generate data, MAD-TD's ability to combat value overestimation, and its practical stability gains for continued learning.
   Submitted 11 October, 2024; originally announced October 2024.
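
The data-augmentation idea in the MAD-TD abstract, replacing a small fraction of each real minibatch with transitions rolled out from a learned world model, can be sketched as follows. The buffer layout, the mixing ratio, and the world_model call are placeholder assumptions for illustration, not the method's actual implementation.

```python
# Hedged sketch: mix a small share of model-generated transitions into each
# TD-learning minibatch. Not MAD-TD's actual code; names are placeholders.
import random

def sample_mixed_batch(replay_buffer, world_model, batch_size=256, model_fraction=0.05):
    """Return a minibatch in which a small fraction of transitions is synthetic."""
    n_model = int(batch_size * model_fraction)
    n_real = batch_size - n_model

    real = random.sample(replay_buffer, n_real)
    # Roll the learned model one step forward from real states to get synthetic data.
    seeds = random.sample(replay_buffer, n_model)
    synthetic = [world_model(t["state"]) for t in seeds]  # assumed to return (s, a, r, s') dicts
    return real + synthetic
```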

4. arXiv:2410.04618 [pdf, other]  cs.CV
   Towards Unsupervised Blind Face Restoration using Diffusion Prior
   Authors: Tianshu Kuai, Sina Honari, Igor Gilitschenski, Alex Levinshtein
   Abstract: Blind face restoration methods have shown remarkable performance, particularly when trained on large-scale synthetic datasets with supervised learning. These datasets are often generated by simulating low-quality face images with a handcrafted image degradation pipeline. The models trained on such synthetic degradations, however, cannot deal with inputs of unseen degradations. In this paper, we address this issue by using only a set of input images, with unknown degradations and without ground truth targets, to fine-tune a restoration model that learns to map them to clean and contextually consistent outputs. We utilize a pre-trained diffusion model as a generative prior through which we generate high quality images from the natural image distribution while maintaining the input image content through consistency constraints. These generated images are then used as pseudo targets to fine-tune a pre-trained restoration model. Unlike many recent approaches that employ diffusion models at test time, we only do so during training and thus maintain an efficient inference-time performance. Extensive experiments show that the proposed approach can consistently improve the perceptual quality of pre-trained blind face restoration models while maintaining great consistency with the input contents. Our best model also achieves the state-of-the-art results on both synthetic and real-world datasets.
   Submitted 24 November, 2024; v1 submitted 6 October, 2024; originally announced October 2024.
   Comments: WACV 2025. Project page: https://dt-bfr.github.io/

5. arXiv:2409.18314 [pdf, other]  cs.LG, cs.CL, cs.CV
   Realistic Evaluation of Model Merging for Compositional Generalization
   Authors: Derek Tam, Yash Kant, Brian Lester, Igor Gilitschenski, Colin Raffel
   Abstract: Merging has become a widespread way to cheaply combine individual models into a single model that inherits their capabilities and attains better performance. This popularity has spurred rapid development of many new merging methods, which are typically validated in disparate experimental settings and frequently differ in the assumptions made about model architecture, data availability, and computational budget. In this work, we characterize the relative merits of different merging methods by evaluating them in a shared experimental setting and precisely identifying the practical requirements of each method. Specifically, our setting focuses on using merging for compositional generalization of capabilities in image classification, image generation, and natural language processing. Additionally, we measure the computational costs of different merging methods as well as how they perform when scaling the number of models being merged. Taken together, our results clarify the state of the field of model merging and provide a comprehensive and rigorous experimental setup to test new methods.
   Submitted 26 September, 2024; originally announced September 2024.
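
For context on what model merging means mechanically: the simplest baseline is uniform parameter averaging of models that share an architecture. The paper evaluates considerably more sophisticated methods; the sketch below (assuming PyTorch state dicts) only illustrates that basic operation.

```python
# Hedged sketch of the simplest merging baseline: uniform parameter averaging.
# The merging methods evaluated in the paper are more involved than this.
import torch

def average_merge(state_dicts):
    """Uniformly average a list of state dicts from models with identical architectures."""
    merged = {}
    for key in state_dicts[0]:
        merged[key] = torch.stack([sd[key].float() for sd in state_dicts]).mean(dim=0)
    return merged

# Usage sketch:
# merged = average_merge([model_a.state_dict(), model_b.state_dict()])
# merged_model.load_state_dict(merged)
```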

6. arXiv:2407.06683 [pdf, other]  cs.RO, cs.CV, cs.LG
   Accelerating Online Mapping and Behavior Prediction via Direct BEV Feature Attention
   Authors: Xunjiang Gu, Guanyu Song, Igor Gilitschenski, Marco Pavone, Boris Ivanovic
   Abstract: Understanding road geometry is a critical component of the autonomous vehicle (AV) stack. While high-definition (HD) maps can readily provide such information, they suffer from high labeling and maintenance costs. Accordingly, many recent works have proposed methods for estimating HD maps online from sensor data. The vast majority of recent approaches encode multi-camera observations into an intermediate representation, e.g., a bird's eye view (BEV) grid, and produce vector map elements via a decoder. While this architecture is performant, it decimates much of the information encoded in the intermediate representation, preventing downstream tasks (e.g., behavior prediction) from leveraging them. In this work, we propose exposing the rich internal features of online map estimation methods and show how they enable more tightly integrating online mapping with trajectory forecasting. In doing so, we find that directly accessing internal BEV features yields up to 73% faster inference speeds and up to 29% more accurate predictions on the real-world nuScenes dataset.
   Submitted 9 July, 2024; originally announced July 2024.
   Comments: 21 pages, 10 figures, 6 tables. ECCV 2024

7. arXiv:2406.17718 [pdf, other]  cs.LG
   When does Self-Prediction help? Understanding Auxiliary Tasks in Reinforcement Learning
   Authors: Claas Voelcker, Tyler Kastner, Igor Gilitschenski, Amir-massoud Farahmand
   Abstract: We investigate the impact of auxiliary learning tasks such as observation reconstruction and latent self-prediction on the representation learning problem in reinforcement learning. We also study how they interact with distractions and observation functions in the MDP. We provide a theoretical analysis of the learning dynamics of observation reconstruction, latent self-prediction, and TD learning in the presence of distractions and observation functions under linear model assumptions. With this formalization, we are able to explain why latent self-prediction is a helpful auxiliary task, while observation reconstruction can provide more useful features when used in isolation. Our empirical analysis shows that the insights obtained from our learning dynamics framework predict the behavior of these loss functions beyond the linear model assumption in non-linear neural networks. This reinforces the usefulness of the linear model framework not only for theoretical analysis, but also for practical benefit in applied problems.
   Submitted 25 June, 2024; originally announced June 2024.
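
As a rough, generic illustration of the latent self-prediction auxiliary task discussed above: predict the encoder's latent for the next observation from the current latent and action, and add the prediction error to the RL objective. Network sizes, the stop-gradient target, and the squared-error loss are assumptions of this sketch, not the paper's exact formulation.

```python
# Hedged sketch of a latent self-prediction auxiliary loss for RL.
import torch
import torch.nn as nn

class LatentSelfPrediction(nn.Module):
    def __init__(self, obs_dim: int, act_dim: int, latent_dim: int = 64):
        super().__init__()
        self.encoder = nn.Sequential(nn.Linear(obs_dim, latent_dim), nn.ReLU())
        self.dynamics = nn.Linear(latent_dim + act_dim, latent_dim)

    def aux_loss(self, obs, act, next_obs):
        z = self.encoder(obs)
        z_next_pred = self.dynamics(torch.cat([z, act], dim=-1))
        with torch.no_grad():                 # stop gradient through the target latent
            z_next = self.encoder(next_obs)
        return ((z_next_pred - z_next) ** 2).mean()
```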

8. arXiv:2406.15349 [pdf, other]  cs.CV, cs.AI, cs.LG, cs.RO
   NAVSIM: Data-Driven Non-Reactive Autonomous Vehicle Simulation and Benchmarking
   Authors: Daniel Dauner, Marcel Hallgarten, Tianyu Li, Xinshuo Weng, Zhiyu Huang, Zetong Yang, Hongyang Li, Igor Gilitschenski, Boris Ivanovic, Marco Pavone, Andreas Geiger, Kashyap Chitta
   Abstract: Benchmarking vision-based driving policies is challenging. On one hand, open-loop evaluation with real data is easy, but these results do not reflect closed-loop performance. On the other, closed-loop evaluation is possible in simulation, but is hard to scale due to its significant computational demands. Further, the simulators available today exhibit a large domain gap to real data. This has resulted in an inability to draw clear conclusions from the rapidly growing body of research on end-to-end autonomous driving. In this paper, we present NAVSIM, a middle ground between these evaluation paradigms, where we use large datasets in combination with a non-reactive simulator to enable large-scale real-world benchmarking. Specifically, we gather simulation-based metrics, such as progress and time to collision, by unrolling bird's eye view abstractions of the test scenes for a short simulation horizon. Our simulation is non-reactive, i.e., the evaluated policy and environment do not influence each other. As we demonstrate empirically, this decoupling allows open-loop metric computation while being better aligned with closed-loop evaluations than traditional displacement errors. NAVSIM enabled a new competition held at CVPR 2024, where 143 teams submitted 463 entries, resulting in several new insights. On a large set of challenging scenarios, we observe that simple methods with moderate compute requirements such as TransFuser can match recent large-scale end-to-end driving architectures such as UniAD. Our modular framework can potentially be extended with new datasets, data curation strategies, and metrics, and will be continually maintained to host future challenges. Our code is available at https://github.com/autonomousvision/navsim.
   Submitted 31 October, 2024; v1 submitted 21 June, 2024; originally announced June 2024.
   Comments: NeurIPS 2024 Datasets and Benchmarks

9. arXiv:2406.09292 [pdf, other]  cs.CV, cs.AI, cs.LG
   Neural Assets: 3D-Aware Multi-Object Scene Synthesis with Image Diffusion Models
   Authors: Ziyi Wu, Yulia Rubanova, Rishabh Kabra, Drew A. Hudson, Igor Gilitschenski, Yusuf Aytar, Sjoerd van Steenkiste, Kelsey R. Allen, Thomas Kipf
   Abstract: We address the problem of multi-object 3D pose control in image diffusion models. Instead of conditioning on a sequence of text tokens, we propose to use a set of per-object representations, Neural Assets, to control the 3D pose of individual objects in a scene. Neural Assets are obtained by pooling visual representations of objects from a reference image, such as a frame in a video, and are trained to reconstruct the respective objects in a different image, e.g., a later frame in the video. Importantly, we encode object visuals from the reference image while conditioning on object poses from the target frame. This enables learning disentangled appearance and pose features. Combining visual and 3D pose representations in a sequence-of-tokens format allows us to keep the text-to-image architecture of existing models, with Neural Assets in place of text tokens. By fine-tuning a pre-trained text-to-image diffusion model with this information, our approach enables fine-grained 3D pose and placement control of individual objects in a scene. We further demonstrate that Neural Assets can be transferred and recomposed across different scenes. Our model achieves state-of-the-art multi-object editing results on both synthetic 3D scene datasets, as well as two real-world video datasets (Objectron, Waymo Open).
   Submitted 28 October, 2024; v1 submitted 13 June, 2024; originally announced June 2024.
   Comments: Additional details and video results are available at https://neural-assets-paper.github.io/

10. arXiv:2404.10765 [pdf, other]  cs.CV
    RefFusion: Reference Adapted Diffusion Models for 3D Scene Inpainting
    Authors: Ashkan Mirzaei, Riccardo De Lutio, Seung Wook Kim, David Acuna, Jonathan Kelly, Sanja Fidler, Igor Gilitschenski, Zan Gojcic
    Abstract: Neural reconstruction approaches are rapidly emerging as the preferred representation for 3D scenes, but their limited editability is still posing a challenge. In this work, we propose an approach for 3D scene inpainting -- the task of coherently replacing parts of the reconstructed scene with desired content. Scene inpainting is an inherently ill-posed task as there exist many solutions that plausibly replace the missing content. A good inpainting method should therefore not only enable high-quality synthesis but also a high degree of control. Based on this observation, we focus on enabling explicit control over the inpainted content and leverage a reference image as an efficient means to achieve this goal. Specifically, we introduce RefFusion, a novel 3D inpainting method based on a multi-scale personalization of an image inpainting diffusion model to the given reference view. The personalization effectively adapts the prior distribution to the target scene, resulting in a lower variance of score distillation objective and hence significantly sharper details. Our framework achieves state-of-the-art results for object removal while maintaining high controllability. We further demonstrate the generality of our formulation on other downstream tasks such as object insertion, scene outpainting, and sparse view reconstruction.
    Submitted 16 April, 2024; originally announced April 2024.
    Comments: Project page: https://reffusion.github.io
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.16439v1-abstract-full').style.display = 'none'; document.getElementById('2403.16439v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">14 pages, 14 figures, 6 tables. CVPR 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.12943">arXiv:2403.12943</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.12943">pdf</a>, <a href="https://arxiv.org/format/2403.12943">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Vid2Robot: End-to-end Video-conditioned Policy Learning with Cross-Attention Transformers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Jain%2C+V">Vidhi Jain</a>, <a href="/search/cs?searchtype=author&amp;query=Attarian%2C+M">Maria Attarian</a>, <a href="/search/cs?searchtype=author&amp;query=Joshi%2C+N+J">Nikhil J Joshi</a>, <a href="/search/cs?searchtype=author&amp;query=Wahid%2C+A">Ayzaan Wahid</a>, <a href="/search/cs?searchtype=author&amp;query=Driess%2C+D">Danny Driess</a>, <a href="/search/cs?searchtype=author&amp;query=Vuong%2C+Q">Quan Vuong</a>, <a href="/search/cs?searchtype=author&amp;query=Sanketi%2C+P+R">Pannag R Sanketi</a>, <a href="/search/cs?searchtype=author&amp;query=Sermanet%2C+P">Pierre Sermanet</a>, <a href="/search/cs?searchtype=author&amp;query=Welker%2C+S">Stefan Welker</a>, <a href="/search/cs?searchtype=author&amp;query=Chan%2C+C">Christine Chan</a>, <a href="/search/cs?searchtype=author&amp;query=Gilitschenski%2C+I">Igor Gilitschenski</a>, <a href="/search/cs?searchtype=author&amp;query=Bisk%2C+Y">Yonatan Bisk</a>, <a href="/search/cs?searchtype=author&amp;query=Dwibedi%2C+D">Debidatta Dwibedi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.12943v2-abstract-short" style="display: inline;"> Large-scale multi-task robotic manipulation systems often rely on text to specify the task. In this work, we explore whether a robot can learn by observing humans. To do so, the robot must understand a person&#39;s intent and perform the inferred task despite differences in the embodiments and environments. We introduce Vid2Robot, an end-to-end video-conditioned policy that takes human videos demonstr&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.12943v2-abstract-full').style.display = 'inline'; document.getElementById('2403.12943v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.12943v2-abstract-full" style="display: none;"> Large-scale multi-task robotic manipulation systems often rely on text to specify the task. 
In this work, we explore whether a robot can learn by observing humans. To do so, the robot must understand a person&#39;s intent and perform the inferred task despite differences in the embodiments and environments. We introduce Vid2Robot, an end-to-end video-conditioned policy that takes human videos demonstrating manipulation tasks as input and produces robot actions. Our model is trained with a large dataset of prompt video-robot trajectory pairs to learn unified representations of human and robot actions from videos. Vid2Robot uses cross-attention transformer layers between video features and the current robot state to produce the actions and perform the same task as shown in the video. We use auxiliary contrastive losses to align the prompt and robot video representations for better policies. We evaluate Vid2Robot on real-world robots and observe over 20% improvement over BC-Z when using human prompt videos. Further, we also show cross-object motion transfer ability that enables video-conditioned policies to transfer a motion observed on one object in the prompt video to another object in the robot&#39;s own environment. Videos available at https://vid2robot.github.io <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.12943v2-abstract-full').style.display = 'none'; document.getElementById('2403.12943v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Robotics: Science &amp; Systems (RSS) 2024. 
https://vid2robot.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.05996">arXiv:2403.05996</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.05996">pdf</a>, <a href="https://arxiv.org/format/2403.05996">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Dissecting Deep RL with High Update Ratios: Combatting Value Divergence </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hussing%2C+M">Marcel Hussing</a>, <a href="/search/cs?searchtype=author&amp;query=Voelcker%2C+C">Claas Voelcker</a>, <a href="/search/cs?searchtype=author&amp;query=Gilitschenski%2C+I">Igor Gilitschenski</a>, <a href="/search/cs?searchtype=author&amp;query=Farahmand%2C+A">Amir-massoud Farahmand</a>, <a href="/search/cs?searchtype=author&amp;query=Eaton%2C+E">Eric Eaton</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.05996v3-abstract-short" style="display: inline;"> We show that deep reinforcement learning algorithms can retain their ability to learn without resetting network parameters in settings where the number of gradient updates greatly exceeds the number of environment samples by combatting value function divergence. Under large update-to-data ratios, a recent study by Nikishin et al. (2022) suggested the emergence of a primacy bias, in which agents ov&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.05996v3-abstract-full').style.display = 'inline'; document.getElementById('2403.05996v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.05996v3-abstract-full" style="display: none;"> We show that deep reinforcement learning algorithms can retain their ability to learn without resetting network parameters in settings where the number of gradient updates greatly exceeds the number of environment samples by combatting value function divergence. Under large update-to-data ratios, a recent study by Nikishin et al. (2022) suggested the emergence of a primacy bias, in which agents overfit early interactions and downplay later experience, impairing their ability to learn. In this work, we investigate the phenomena leading to the primacy bias. We inspect the early stages of training that were conjectured to cause the failure to learn and find that one fundamental challenge is a long-standing acquaintance: value function divergence. Overinflated Q-values are found not only on out-of-distribution but also in-distribution data and can be linked to overestimation on unseen action prediction propelled by optimizer momentum. We employ a simple unit-ball normalization that enables learning under large update ratios, show its efficacy on the widely used dm_control suite, and obtain strong performance on the challenging dog tasks, competitive with model-based approaches. Our results question, in parts, the prior explanation for sub-optimal learning due to overfitting early data. 
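Illustration only: the abstract above attributes the fix to a simple unit-ball normalization but does not give its exact form. A common variant, assumed here, rescales any feature vector whose norm exceeds one back onto the unit ball while leaving smaller vectors untouched.
<pre><code>
import numpy as np

def unit_ball_normalize(features, eps=1e-8):
    """Project each row of a (batch, dim) feature matrix into the unit ball.
    Vectors already inside the ball are left unchanged; larger vectors are
    rescaled to (approximately) unit norm, bounding the feature magnitude."""
    norms = np.linalg.norm(features, axis=-1, keepdims=True)
    scale = np.maximum(norms, 1.0)
    return features / (scale + eps)

feats = np.array([[0.3, 0.4], [3.0, 4.0]])
print(unit_ball_normalize(feats))   # first row kept, second row on the ball
</code></pre>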
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.05996v3-abstract-full').style.display = 'none'; document.getElementById('2403.05996v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted as a conference paper at the First Reinforcement Learning Conference (RLC)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.05235">arXiv:2402.05235</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2402.05235">pdf</a>, <a href="https://arxiv.org/format/2402.05235">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> SPAD : Spatially Aware Multiview Diffusers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Kant%2C+Y">Yash Kant</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Z">Ziyi Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Vasilkovsky%2C+M">Michael Vasilkovsky</a>, <a href="/search/cs?searchtype=author&amp;query=Qian%2C+G">Guocheng Qian</a>, <a href="/search/cs?searchtype=author&amp;query=Ren%2C+J">Jian Ren</a>, <a href="/search/cs?searchtype=author&amp;query=Guler%2C+R+A">Riza Alp Guler</a>, <a href="/search/cs?searchtype=author&amp;query=Ghanem%2C+B">Bernard Ghanem</a>, <a href="/search/cs?searchtype=author&amp;query=Tulyakov%2C+S">Sergey Tulyakov</a>, <a href="/search/cs?searchtype=author&amp;query=Gilitschenski%2C+I">Igor Gilitschenski</a>, <a href="/search/cs?searchtype=author&amp;query=Siarohin%2C+A">Aliaksandr Siarohin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.05235v1-abstract-short" style="display: inline;"> We present SPAD, a novel approach for creating consistent multi-view images from text prompts or single images. To enable multi-view generation, we repurpose a pretrained 2D diffusion model by extending its self-attention layers with cross-view interactions, and fine-tune it on a high quality subset of Objaverse. We find that a naive extension of the self-attention proposed in prior work (e.g. MVD&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.05235v1-abstract-full').style.display = 'inline'; document.getElementById('2402.05235v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.05235v1-abstract-full" style="display: none;"> We present SPAD, a novel approach for creating consistent multi-view images from text prompts or single images. To enable multi-view generation, we repurpose a pretrained 2D diffusion model by extending its self-attention layers with cross-view interactions, and fine-tune it on a high quality subset of Objaverse. 
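Illustration only: a later sentence of this SPAD abstract injects Plucker coordinates derived from camera rays as positional encodings. The standard Plucker embedding of a ray with origin o and unit direction d is the 6-vector (d, o x d); the sketch below assumes a pinhole camera with known intrinsics and pose.
<pre><code>
import numpy as np

def pixel_ray_plucker(u, v, K, cam_to_world):
    """Plucker coordinates (direction, moment) of the ray through pixel (u, v).
    K is the 3x3 intrinsic matrix; cam_to_world is a 4x4 camera pose."""
    d_cam = np.linalg.inv(K) @ np.array([u, v, 1.0])   # ray in camera frame
    d_world = cam_to_world[:3, :3] @ d_cam
    d_world = d_world / np.linalg.norm(d_world)        # unit direction
    origin = cam_to_world[:3, 3]                       # camera center
    moment = np.cross(origin, d_world)
    return np.concatenate([d_world, moment])           # 6-D per-ray encoding

K = np.array([[100.0, 0, 64], [0, 100.0, 64], [0, 0, 1]])
pose = np.eye(4)
print(pixel_ray_plucker(32.0, 32.0, K, pose))
</code></pre>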
We find that a naive extension of the self-attention proposed in prior work (e.g. MVDream) leads to content copying between views. Therefore, we explicitly constrain the cross-view attention based on epipolar geometry. To further enhance 3D consistency, we utilize Plucker coordinates derived from camera rays and inject them as positional encoding. This enables SPAD to reason over spatial proximity in 3D well. In contrast to recent works that can only generate views at fixed azimuth and elevation, SPAD offers full camera control and achieves state-of-the-art results in novel view synthesis on unseen objects from the Objaverse and Google Scanned Objects datasets. Finally, we demonstrate that text-to-3D generation using SPAD prevents the multi-face Janus issue. See more details at our webpage: https://yashkant.github.io/spad <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.05235v1-abstract-full').style.display = 'none'; document.getElementById('2402.05235v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Webpage: https://yashkant.github.io/spad</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.00867">arXiv:2402.00867</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2402.00867">pdf</a>, <a href="https://arxiv.org/format/2402.00867">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> AToM: Amortized Text-to-Mesh using 2D Diffusion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Qian%2C+G">Guocheng Qian</a>, <a href="/search/cs?searchtype=author&amp;query=Cao%2C+J">Junli Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Siarohin%2C+A">Aliaksandr Siarohin</a>, <a href="/search/cs?searchtype=author&amp;query=Kant%2C+Y">Yash Kant</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+C">Chaoyang Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Vasilkovsky%2C+M">Michael Vasilkovsky</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+H">Hsin-Ying Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Fang%2C+Y">Yuwei Fang</a>, <a href="/search/cs?searchtype=author&amp;query=Skorokhodov%2C+I">Ivan Skorokhodov</a>, <a href="/search/cs?searchtype=author&amp;query=Zhuang%2C+P">Peiye Zhuang</a>, <a href="/search/cs?searchtype=author&amp;query=Gilitschenski%2C+I">Igor Gilitschenski</a>, <a href="/search/cs?searchtype=author&amp;query=Ren%2C+J">Jian Ren</a>, <a href="/search/cs?searchtype=author&amp;query=Ghanem%2C+B">Bernard Ghanem</a>, <a href="/search/cs?searchtype=author&amp;query=Aberman%2C+K">Kfir Aberman</a>, <a href="/search/cs?searchtype=author&amp;query=Tulyakov%2C+S">Sergey Tulyakov</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.00867v1-abstract-short" style="display: 
inline;"> We introduce Amortized Text-to-Mesh (AToM), a feed-forward text-to-mesh framework optimized across multiple text prompts simultaneously. In contrast to existing text-to-3D methods that often entail time-consuming per-prompt optimization and commonly output representations other than polygonal meshes, AToM directly generates high-quality textured meshes in less than 1 second with around 10 times re&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.00867v1-abstract-full').style.display = 'inline'; document.getElementById('2402.00867v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.00867v1-abstract-full" style="display: none;"> We introduce Amortized Text-to-Mesh (AToM), a feed-forward text-to-mesh framework optimized across multiple text prompts simultaneously. In contrast to existing text-to-3D methods that often entail time-consuming per-prompt optimization and commonly output representations other than polygonal meshes, AToM directly generates high-quality textured meshes in less than 1 second with around 10 times reduction in the training cost, and generalizes to unseen prompts. Our key idea is a novel triplane-based text-to-mesh architecture with a two-stage amortized optimization strategy that ensures stable training and enables scalability. Through extensive experiments on various prompt benchmarks, AToM significantly outperforms state-of-the-art amortized approaches with over 4 times higher accuracy (in DF415 dataset) and produces more distinguishable and higher-quality 3D outputs. AToM demonstrates strong generalizability, offering finegrained 3D assets for unseen interpolated prompts without further optimization during inference, unlike per-prompt solutions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.00867v1-abstract-full').style.display = 'none'; document.getElementById('2402.00867v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">19 pages with appendix and references. 
Webpage: https://snap-research.github.io/AToM/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.03864">arXiv:2312.03864</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2312.03864">pdf</a>, <a href="https://arxiv.org/format/2312.03864">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Geometry Matching for Multi-Embodiment Grasping </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Attarian%2C+M">Maria Attarian</a>, <a href="/search/cs?searchtype=author&amp;query=Asif%2C+M+A">Muhammad Adil Asif</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+J">Jingzhou Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Hari%2C+R">Ruthrash Hari</a>, <a href="/search/cs?searchtype=author&amp;query=Garg%2C+A">Animesh Garg</a>, <a href="/search/cs?searchtype=author&amp;query=Gilitschenski%2C+I">Igor Gilitschenski</a>, <a href="/search/cs?searchtype=author&amp;query=Tompson%2C+J">Jonathan Tompson</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.03864v1-abstract-short" style="display: inline;"> Many existing learning-based grasping approaches concentrate on a single embodiment, provide limited generalization to higher DoF end-effectors and cannot capture a diverse set of grasp modes. We tackle the problem of grasping using multiple embodiments by learning rich geometric representations for both objects and end-effectors using Graph Neural Networks. Our novel method - GeoMatch - applies s&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.03864v1-abstract-full').style.display = 'inline'; document.getElementById('2312.03864v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.03864v1-abstract-full" style="display: none;"> Many existing learning-based grasping approaches concentrate on a single embodiment, provide limited generalization to higher DoF end-effectors and cannot capture a diverse set of grasp modes. We tackle the problem of grasping using multiple embodiments by learning rich geometric representations for both objects and end-effectors using Graph Neural Networks. Our novel method - GeoMatch - applies supervised learning on grasping data from multiple embodiments, learning end-to-end contact point likelihood maps as well as conditional autoregressive predictions of grasps keypoint-by-keypoint. We compare our method against baselines that support multiple embodiments. Our approach performs better across three end-effectors, while also producing diverse grasps. Examples, including real robot demos, can be found at geo-match.github.io. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.03864v1-abstract-full').style.display = 'none'; document.getElementById('2312.03864v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. 
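Illustration only: the GeoMatch abstract above learns contact-point likelihood maps and conditional autoregressive keypoint predictions. The toy loop below greedily picks one contact point per step from an object point cloud, re-scoring candidates based on points already chosen; the suppression rule is a crude stand-in for conditioning, not the paper's GNN.
<pre><code>
import numpy as np

def pick_contact_points(points, base_logits, num_keypoints=3, repel=4.0):
    """Greedy, autoregressive selection of contact keypoints.
    points: (N, 3) object point cloud; base_logits: (N,) unconditional scores.
    Each step down-weights candidates near already-selected keypoints."""
    chosen = []
    logits = base_logits.astype(float)
    for _ in range(num_keypoints):
        idx = int(np.argmax(logits))
        chosen.append(idx)
        dists = np.linalg.norm(points - points[idx], axis=-1)
        logits = logits - repel * np.exp(-dists)   # suppress nearby candidates
        logits[idx] = -np.inf
    return chosen

rng = np.random.default_rng(1)
pts = rng.normal(size=(100, 3))
print(pick_contact_points(pts, rng.normal(size=100)))
</code></pre>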
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> 7th Annual Conference on Robot Learning, 2023 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.17286">arXiv:2311.17286</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2311.17286">pdf</a>, <a href="https://arxiv.org/format/2311.17286">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> LEOD: Label-Efficient Object Detection for Event Cameras </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Z">Ziyi Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Gehrig%2C+M">Mathias Gehrig</a>, <a href="/search/cs?searchtype=author&amp;query=Lyu%2C+Q">Qing Lyu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+X">Xudong Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Gilitschenski%2C+I">Igor Gilitschenski</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.17286v2-abstract-short" style="display: inline;"> Object detection with event cameras benefits from the sensor&#39;s low latency and high dynamic range. However, it is costly to fully label event streams for supervised training due to their high temporal resolution. To reduce this cost, we present LEOD, the first method for label-efficient event-based detection. Our approach unifies weakly- and semi-supervised object detection with a self-training me&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.17286v2-abstract-full').style.display = 'inline'; document.getElementById('2311.17286v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.17286v2-abstract-full" style="display: none;"> Object detection with event cameras benefits from the sensor&#39;s low latency and high dynamic range. However, it is costly to fully label event streams for supervised training due to their high temporal resolution. To reduce this cost, we present LEOD, the first method for label-efficient event-based detection. Our approach unifies weakly- and semi-supervised object detection with a self-training mechanism. We first utilize a detector pre-trained on limited labels to produce pseudo ground truth on unlabeled events. Then, the detector is re-trained with both real and generated labels. Leveraging the temporal consistency of events, we run bi-directional inference and apply tracking-based post-processing to enhance the quality of pseudo labels. To stabilize training against label noise, we further design a soft anchor assignment strategy. We introduce new experimental protocols to evaluate the task of label-efficient event-based detection on Gen1 and 1Mpx datasets. LEOD consistently outperforms supervised baselines across various labeling ratios. For example, on Gen1, it improves mAP by 8.6% and 7.8% for RVT-S trained with 1% and 2% labels. On 1Mpx, RVT-S with 10% labels even surpasses its fully-supervised counterpart using 100% labels. LEOD maintains its effectiveness even when all labeled data are available, reaching new state-of-the-art results. 
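Illustration only: this LEOD entry produces pseudo ground truth with a detector pre-trained on limited labels. A minimal confidence-threshold filter of the kind typically used in such self-training pipelines is sketched below; the threshold value and box format are assumptions, and the paper's tracking-based post-processing and soft anchor assignment are omitted.
<pre><code>
def filter_pseudo_labels(detections, score_threshold=0.7):
    """Keep only confident detections as pseudo ground truth.
    detections: list of dicts like {"box": (x1, y1, x2, y2), "score": float,
    "cls": int}. Returns the subset whose score exceeds the threshold."""
    return [d for d in detections if d["score"] > score_threshold]

dets = [
    {"box": (10, 10, 50, 40), "score": 0.91, "cls": 0},   # kept
    {"box": (5, 5, 20, 20), "score": 0.35, "cls": 1},     # dropped
]
print(filter_pseudo_labels(dets))
</code></pre>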
Finally, we show that our method readily scales to improve larger detectors as well. Code is released at https://github.com/Wuziyi616/LEOD <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.17286v2-abstract-full').style.display = 'none'; document.getElementById('2311.17286v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">CVPR 2024. Code: https://github.com/Wuziyi616/LEOD</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.17880">arXiv:2310.17880</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2310.17880">pdf</a>, <a href="https://arxiv.org/format/2310.17880">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Reconstructive Latent-Space Neural Radiance Fields for Efficient 3D Scene Representations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Aumentado-Armstrong%2C+T">Tristan Aumentado-Armstrong</a>, <a href="/search/cs?searchtype=author&amp;query=Mirzaei%2C+A">Ashkan Mirzaei</a>, <a href="/search/cs?searchtype=author&amp;query=Brubaker%2C+M+A">Marcus A. Brubaker</a>, <a href="/search/cs?searchtype=author&amp;query=Kelly%2C+J">Jonathan Kelly</a>, <a href="/search/cs?searchtype=author&amp;query=Levinshtein%2C+A">Alex Levinshtein</a>, <a href="/search/cs?searchtype=author&amp;query=Derpanis%2C+K+G">Konstantinos G. Derpanis</a>, <a href="/search/cs?searchtype=author&amp;query=Gilitschenski%2C+I">Igor Gilitschenski</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.17880v1-abstract-short" style="display: inline;"> Neural Radiance Fields (NeRFs) have proven to be powerful 3D representations, capable of high quality novel view synthesis of complex scenes. While NeRFs have been applied to graphics, vision, and robotics, problems with slow rendering speed and characteristic visual artifacts prevent adoption in many use cases. In this work, we investigate combining an autoencoder (AE) with a NeRF, in which laten&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.17880v1-abstract-full').style.display = 'inline'; document.getElementById('2310.17880v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.17880v1-abstract-full" style="display: none;"> Neural Radiance Fields (NeRFs) have proven to be powerful 3D representations, capable of high quality novel view synthesis of complex scenes. While NeRFs have been applied to graphics, vision, and robotics, problems with slow rendering speed and characteristic visual artifacts prevent adoption in many use cases. 
In this work, we investigate combining an autoencoder (AE) with a NeRF, in which latent features (instead of colours) are rendered and then convolutionally decoded. The resulting latent-space NeRF can produce novel views with higher quality than standard colour-space NeRFs, as the AE can correct certain visual artifacts, while rendering over three times faster. Our work is orthogonal to other techniques for improving NeRF efficiency. Further, we can control the tradeoff between efficiency and image quality by shrinking the AE architecture, achieving over 13 times faster rendering with only a small drop in performance. We hope that our approach can form the basis of an efficient, yet high-fidelity, 3D scene representation for downstream tasks, especially when retaining differentiability is useful, as in many robotics scenarios requiring continual learning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.17880v1-abstract-full').style.display = 'none'; document.getElementById('2310.17880v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.10 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.16167">arXiv:2310.16167</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2310.16167">pdf</a>, <a href="https://arxiv.org/format/2310.16167">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> iNVS: Repurposing Diffusion Inpainters for Novel View Synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Kant%2C+Y">Yash Kant</a>, <a href="/search/cs?searchtype=author&amp;query=Siarohin%2C+A">Aliaksandr Siarohin</a>, <a href="/search/cs?searchtype=author&amp;query=Vasilkovsky%2C+M">Michael Vasilkovsky</a>, <a href="/search/cs?searchtype=author&amp;query=Guler%2C+R+A">Riza Alp Guler</a>, <a href="/search/cs?searchtype=author&amp;query=Ren%2C+J">Jian Ren</a>, <a href="/search/cs?searchtype=author&amp;query=Tulyakov%2C+S">Sergey Tulyakov</a>, <a href="/search/cs?searchtype=author&amp;query=Gilitschenski%2C+I">Igor Gilitschenski</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.16167v1-abstract-short" style="display: inline;"> We present a method for generating consistent novel views from a single source image. Our approach focuses on maximizing the reuse of visible pixels from the source image. To achieve this, we use a monocular depth estimator that transfers visible pixels from the source view to the target view. 
Starting from a pre-trained 2D inpainting diffusion model, we train our method on the large-scale Objaver&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.16167v1-abstract-full').style.display = 'inline'; document.getElementById('2310.16167v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.16167v1-abstract-full" style="display: none;"> We present a method for generating consistent novel views from a single source image. Our approach focuses on maximizing the reuse of visible pixels from the source image. To achieve this, we use a monocular depth estimator that transfers visible pixels from the source view to the target view. Starting from a pre-trained 2D inpainting diffusion model, we train our method on the large-scale Objaverse dataset to learn 3D object priors. While training we use a novel masking mechanism based on epipolar lines to further improve the quality of our approach. This allows our framework to perform zero-shot novel view synthesis on a variety of objects. We evaluate the zero-shot abilities of our framework on three challenging datasets: Google Scanned Objects, Ray Traced Multiview, and Common Objects in 3D. See our webpage for more details: https://yashkant.github.io/invs/ <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.16167v1-abstract-full').style.display = 'none'; document.getElementById('2310.16167v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to SIGGRAPH Asia, 2023 (Conference Papers)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2308.08947">arXiv:2308.08947</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2308.08947">pdf</a>, <a href="https://arxiv.org/format/2308.08947">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Watch Your Steps: Local Image and Scene Editing by Text Instructions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Mirzaei%2C+A">Ashkan Mirzaei</a>, <a href="/search/cs?searchtype=author&amp;query=Aumentado-Armstrong%2C+T">Tristan Aumentado-Armstrong</a>, <a href="/search/cs?searchtype=author&amp;query=Brubaker%2C+M+A">Marcus A. Brubaker</a>, <a href="/search/cs?searchtype=author&amp;query=Kelly%2C+J">Jonathan Kelly</a>, <a href="/search/cs?searchtype=author&amp;query=Levinshtein%2C+A">Alex Levinshtein</a>, <a href="/search/cs?searchtype=author&amp;query=Derpanis%2C+K+G">Konstantinos G. 
Derpanis</a>, <a href="/search/cs?searchtype=author&amp;query=Gilitschenski%2C+I">Igor Gilitschenski</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2308.08947v1-abstract-short" style="display: inline;"> Denoising diffusion models have enabled high-quality image generation and editing. We present a method to localize the desired edit region implicit in a text instruction. We leverage InstructPix2Pix (IP2P) and identify the discrepancy between IP2P predictions with and without the instruction. This discrepancy is referred to as the relevance map. The relevance map conveys the importance of changing&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.08947v1-abstract-full').style.display = 'inline'; document.getElementById('2308.08947v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2308.08947v1-abstract-full" style="display: none;"> Denoising diffusion models have enabled high-quality image generation and editing. We present a method to localize the desired edit region implicit in a text instruction. We leverage InstructPix2Pix (IP2P) and identify the discrepancy between IP2P predictions with and without the instruction. This discrepancy is referred to as the relevance map. The relevance map conveys the importance of changing each pixel to achieve the edits, and is used to guide the modifications. This guidance ensures that the irrelevant pixels remain unchanged. Relevance maps are further used to enhance the quality of text-guided editing of 3D scenes in the form of neural radiance fields. A field is trained on relevance maps of training views, denoted as the relevance field, defining the 3D region within which modifications should be made. We perform iterative updates on the training views guided by rendered relevance maps from the relevance field. Our method achieves state-of-the-art performance on both image and NeRF editing tasks. Project page: https://ashmrz.github.io/WatchYourSteps/ <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.08947v1-abstract-full').style.display = 'none'; document.getElementById('2308.08947v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2023. 
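Illustration only: the abstract above defines the relevance map as the discrepancy between InstructPix2Pix predictions with and without the text instruction. The sketch below computes that discrepancy for two pre-computed noise predictions and binarizes it into an edit mask; the min-max normalization and threshold are assumptions, and the actual denoising step is omitted.
<pre><code>
import numpy as np

def relevance_map(eps_with_text, eps_without_text, threshold=0.5):
    """Per-pixel discrepancy between two (H, W, C) noise predictions,
    normalized to [0, 1] and thresholded into a binary edit mask."""
    diff = np.abs(eps_with_text - eps_without_text).mean(axis=-1)
    diff = (diff - diff.min()) / (diff.max() - diff.min() + 1e-8)
    return diff, (diff > threshold).astype(np.float32)

rng = np.random.default_rng(2)
a, b = rng.normal(size=(2, 8, 8, 4))
relevance, mask = relevance_map(a, b)
print(relevance.shape, mask.sum())
</code></pre>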
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page: https://ashmrz.github.io/WatchYourSteps/</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> European Conference on Computer Vision (ECCV) 2024 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2307.13924">arXiv:2307.13924</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2307.13924">pdf</a>, <a href="https://arxiv.org/format/2307.13924">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> trajdata: A Unified Interface to Multiple Human Trajectory Datasets </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ivanovic%2C+B">Boris Ivanovic</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+G">Guanyu Song</a>, <a href="/search/cs?searchtype=author&amp;query=Gilitschenski%2C+I">Igor Gilitschenski</a>, <a href="/search/cs?searchtype=author&amp;query=Pavone%2C+M">Marco Pavone</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2307.13924v1-abstract-short" style="display: inline;"> The field of trajectory forecasting has grown significantly in recent years, partially owing to the release of numerous large-scale, real-world human trajectory datasets for autonomous vehicles (AVs) and pedestrian motion tracking. While such datasets have been a boon for the community, they each use custom and unique data formats and APIs, making it cumbersome for researchers to train and evaluat&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.13924v1-abstract-full').style.display = 'inline'; document.getElementById('2307.13924v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2307.13924v1-abstract-full" style="display: none;"> The field of trajectory forecasting has grown significantly in recent years, partially owing to the release of numerous large-scale, real-world human trajectory datasets for autonomous vehicles (AVs) and pedestrian motion tracking. While such datasets have been a boon for the community, they each use custom and unique data formats and APIs, making it cumbersome for researchers to train and evaluate methods across multiple datasets. To remedy this, we present trajdata: a unified interface to multiple human trajectory datasets. At its core, trajdata provides a simple, uniform, and efficient representation and API for trajectory and map data. As a demonstration of its capabilities, in this work we conduct a comprehensive empirical evaluation of existing trajectory datasets, providing users with a rich understanding of the data underpinning much of current pedestrian and AV motion forecasting research, and proposing suggestions for future datasets from these insights. 
trajdata is permissively licensed (Apache 2.0) and can be accessed online at https://github.com/NVlabs/trajdata <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.13924v1-abstract-full').style.display = 'none'; document.getElementById('2307.13924v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 July, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">15 pages, 15 figures, 3 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.17366">arXiv:2306.17366</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2306.17366">pdf</a>, <a href="https://arxiv.org/format/2306.17366">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> $λ$-models: Effective Decision-Aware Reinforcement Learning with Latent Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Voelcker%2C+C+A">Claas A Voelcker</a>, <a href="/search/cs?searchtype=author&amp;query=Ahmadian%2C+A">Arash Ahmadian</a>, <a href="/search/cs?searchtype=author&amp;query=Abachi%2C+R">Romina Abachi</a>, <a href="/search/cs?searchtype=author&amp;query=Gilitschenski%2C+I">Igor Gilitschenski</a>, <a href="/search/cs?searchtype=author&amp;query=Farahmand%2C+A">Amir-massoud Farahmand</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.17366v3-abstract-short" style="display: inline;"> The idea of decision-aware model learning, that models should be accurate where it matters for decision-making, has gained prominence in model-based reinforcement learning. While promising theoretical results have been established, the empirical performance of algorithms leveraging a decision-aware loss has been lacking, especially in continuous control problems. In this paper, we present a study&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.17366v3-abstract-full').style.display = 'inline'; document.getElementById('2306.17366v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.17366v3-abstract-full" style="display: none;"> The idea of decision-aware model learning, that models should be accurate where it matters for decision-making, has gained prominence in model-based reinforcement learning. While promising theoretical results have been established, the empirical performance of algorithms leveraging a decision-aware loss has been lacking, especially in continuous control problems. In this paper, we present a study on the necessary components for decision-aware reinforcement learning models and we showcase design choices that enable well-performing algorithms. 
To this end, we provide a theoretical and empirical investigation into algorithmic ideas in the field. We highlight that empirical design decisions established in the MuZero line of works, most importantly the use of a latent model, are vital to achieving good performance for related algorithms. Furthermore, we show that the MuZero loss function is biased in stochastic environments and establish that this bias has practical consequences. Building on these findings, we present an overview of which decision-aware loss functions are best used in what empirical scenarios, providing actionable insights to practitioners in the field. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.17366v3-abstract-full').style.display = 'none'; document.getElementById('2306.17366v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.06354">arXiv:2306.06354</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2306.06354">pdf</a>, <a href="https://arxiv.org/format/2306.06354">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> EventCLIP: Adapting CLIP for Event-based Object Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Z">Ziyi Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+X">Xudong Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Gilitschenski%2C+I">Igor Gilitschenski</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.06354v3-abstract-short" style="display: inline;"> Recent advances in zero-shot and few-shot classification heavily rely on the success of pre-trained vision-language models (VLMs) such as CLIP. Due to a shortage of large-scale datasets, training such models for event camera data remains infeasible. Thus, adapting existing VLMs across modalities to event vision is an important research challenge. In this work, we introduce EventCLIP, a novel appro&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.06354v3-abstract-full').style.display = 'inline'; document.getElementById('2306.06354v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.06354v3-abstract-full" style="display: none;"> Recent advances in zero-shot and few-shot classification heavily rely on the success of pre-trained vision-language models (VLMs) such as CLIP. Due to a shortage of large-scale datasets, training such models for event camera data remains infeasible. Thus, adapting existing VLMs across modalities to event vision is an important research challenge. In this work, we introduce EventCLIP, a novel approach that utilizes CLIP for zero-shot and few-shot event-based object recognition. 
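Illustration only: the next sentence of this EventCLIP abstract converts raw events into 2D grid-based representations before applying CLIP's image encoder. A minimal per-polarity event-count histogram of that kind is sketched below; the exact representation used in the paper may differ.
<pre><code>
import numpy as np

def events_to_frame(x, y, polarity, height, width):
    """Accumulate events (pixel coordinates plus polarity in {0, 1}) into a
    2-channel count image of shape (2, height, width)."""
    frame = np.zeros((2, height, width), dtype=np.float32)
    np.add.at(frame, (polarity, y, x), 1.0)
    return frame

# Toy usage: four events on a 4x4 sensor.
x = np.array([0, 1, 1, 3])
y = np.array([0, 2, 2, 3])
p = np.array([1, 0, 0, 1])
print(events_to_frame(x, y, p, 4, 4).sum(axis=(1, 2)))   # counts per polarity
</code></pre>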
We first generalize CLIP&#39;s image encoder to event data by converting raw events to 2D grid-based representations. To further enhance performance, we propose a feature adapter to aggregate temporal information over event frames and refine text embeddings to better align with the visual inputs. We evaluate EventCLIP on N-Caltech, N-Cars, and N-ImageNet datasets, achieving state-of-the-art few-shot performance. When fine-tuned on the entire dataset, our method outperforms all existing event classifiers. Moreover, we explore practical applications of EventCLIP including robust event classification and label-free event recognition, where our approach surpasses previous baselines designed specifically for these tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.06354v3-abstract-full').style.display = 'none'; document.getElementById('2306.06354v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Add results on 1) EventCLIP with another VLM FLIP 2) inference speed analysis</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.14797">arXiv:2305.14797</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2305.14797">pdf</a>, <a href="https://arxiv.org/format/2305.14797">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Multi-Abstractive Neural Controller: An Efficient Hierarchical Control Architecture for Interactive Driving </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xiao Li</a>, <a href="/search/cs?searchtype=author&amp;query=Gilitschenski%2C+I">Igor Gilitschenski</a>, <a href="/search/cs?searchtype=author&amp;query=Rosman%2C+G">Guy Rosman</a>, <a href="/search/cs?searchtype=author&amp;query=Karaman%2C+S">Sertac Karaman</a>, <a href="/search/cs?searchtype=author&amp;query=Rus%2C+D">Daniela Rus</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.14797v1-abstract-short" style="display: inline;"> As learning-based methods make their way from perception systems to planning/control stacks, robot control systems have started to enjoy the benefits that data-driven methods provide. Because control systems directly affect the motion of the robot, data-driven methods, especially black box approaches, need to be used with caution considering aspects such as stability and interpretability. 
In this&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.14797v1-abstract-full').style.display = 'inline'; document.getElementById('2305.14797v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.14797v1-abstract-full" style="display: none;"> As learning-based methods make their way from perception systems to planning/control stacks, robot control systems have started to enjoy the benefits that data-driven methods provide. Because control systems directly affect the motion of the robot, data-driven methods, especially black box approaches, need to be used with caution considering aspects such as stability and interpretability. In this paper, we describe a differentiable and hierarchical control architecture. The proposed representation, called \textit{multi-abstractive neural controller}, uses the input image to control the transitions within a novel discrete behavior planner (referred to as the visual automaton generative network, or \textit{vAGN}). The output of a vAGN controls the parameters of a set of dynamic movement primitives which provides the system controls. We train this neural controller with real-world driving data via behavior cloning and show improved explainability, sample efficiency, and similarity to human driving. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.14797v1-abstract-full').style.display = 'none'; document.getElementById('2305.14797v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.11281">arXiv:2305.11281</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2305.11281">pdf</a>, <a href="https://arxiv.org/format/2305.11281">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> SlotDiffusion: Object-Centric Generative Modeling with Diffusion Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Z">Ziyi Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+J">Jingyu Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+W">Wuyue Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Gilitschenski%2C+I">Igor Gilitschenski</a>, <a href="/search/cs?searchtype=author&amp;query=Garg%2C+A">Animesh Garg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.11281v2-abstract-short" style="display: inline;"> Object-centric learning aims to represent visual data with a set of object entities (a.k.a. slots), providing structured representations that enable systematic generalization. Leveraging advanced architectures like Transformers, recent approaches have made significant progress in unsupervised object discovery. 
In addition, slot-based representations hold great potential for generative modeling, su&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.11281v2-abstract-full').style.display = 'inline'; document.getElementById('2305.11281v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.11281v2-abstract-full" style="display: none;"> Object-centric learning aims to represent visual data with a set of object entities (a.k.a. slots), providing structured representations that enable systematic generalization. Leveraging advanced architectures like Transformers, recent approaches have made significant progress in unsupervised object discovery. In addition, slot-based representations hold great potential for generative modeling, such as controllable image generation and object manipulation in image editing. However, current slot-based methods often produce blurry images and distorted objects, exhibiting poor generative modeling capabilities. In this paper, we focus on improving slot-to-image decoding, a crucial aspect for high-quality visual generation. We introduce SlotDiffusion -- an object-centric Latent Diffusion Model (LDM) designed for both image and video data. Thanks to the powerful modeling capacity of LDMs, SlotDiffusion surpasses previous slot models in unsupervised object segmentation and visual generation across six datasets. Furthermore, our learned object features can be utilized by existing object-centric dynamics models, improving video prediction quality and downstream temporal reasoning tasks. Finally, we demonstrate the scalability of SlotDiffusion to unconstrained real-world datasets such as PASCAL VOC and COCO, when integrated with self-supervised pre-trained image encoders. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.11281v2-abstract-full').style.display = 'none'; document.getElementById('2305.11281v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2023 Spotlight. Project page: https://slotdiffusion.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2304.09677">arXiv:2304.09677</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2304.09677">pdf</a>, <a href="https://arxiv.org/format/2304.09677">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Reference-guided Controllable Inpainting of Neural Radiance Fields </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Mirzaei%2C+A">Ashkan Mirzaei</a>, <a href="/search/cs?searchtype=author&amp;query=Aumentado-Armstrong%2C+T">Tristan Aumentado-Armstrong</a>, <a href="/search/cs?searchtype=author&amp;query=Brubaker%2C+M+A">Marcus A. 
Brubaker</a>, <a href="/search/cs?searchtype=author&amp;query=Kelly%2C+J">Jonathan Kelly</a>, <a href="/search/cs?searchtype=author&amp;query=Levinshtein%2C+A">Alex Levinshtein</a>, <a href="/search/cs?searchtype=author&amp;query=Derpanis%2C+K+G">Konstantinos G. Derpanis</a>, <a href="/search/cs?searchtype=author&amp;query=Gilitschenski%2C+I">Igor Gilitschenski</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2304.09677v2-abstract-short" style="display: inline;"> The popularity of Neural Radiance Fields (NeRFs) for view synthesis has led to a desire for NeRF editing tools. Here, we focus on inpainting regions in a view-consistent and controllable manner. In addition to the typical NeRF inputs and masks delineating the unwanted region in each view, we require only a single inpainted view of the scene, i.e., a reference view. We use monocular depth estimator&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.09677v2-abstract-full').style.display = 'inline'; document.getElementById('2304.09677v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2304.09677v2-abstract-full" style="display: none;"> The popularity of Neural Radiance Fields (NeRFs) for view synthesis has led to a desire for NeRF editing tools. Here, we focus on inpainting regions in a view-consistent and controllable manner. In addition to the typical NeRF inputs and masks delineating the unwanted region in each view, we require only a single inpainted view of the scene, i.e., a reference view. We use monocular depth estimators to back-project the inpainted view to the correct 3D positions. Then, via a novel rendering technique, a bilateral solver can construct view-dependent effects in non-reference views, making the inpainted region appear consistent from any view. For non-reference disoccluded regions, which cannot be supervised by the single reference view, we devise a method based on image inpainters to guide both the geometry and appearance. Our approach shows superior performance to NeRF inpainting baselines, with the additional advantage that a user can control the generated scene via a single inpainted image. Project page: https://ashmrz.github.io/reference-guided-3d <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.09677v2-abstract-full').style.display = 'none'; document.getElementById('2304.09677v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 April, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 April, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2023. 
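Illustration only: the abstract above back-projects the single inpainted reference view to 3D using monocular depth estimates. The standard pinhole unprojection underlying such a step is sketched below, with intrinsics and camera pose assumed known.
<pre><code>
import numpy as np

def backproject(depth, K, cam_to_world):
    """Lift a (H, W) depth map to world-space 3D points of shape (H, W, 3)."""
    h, w = depth.shape
    u, v = np.meshgrid(np.arange(w), np.arange(h))
    pix = np.stack([u, v, np.ones_like(u)], axis=-1).astype(float)   # (H, W, 3)
    rays = pix @ np.linalg.inv(K).T                # camera-frame rays at depth 1
    pts_cam = rays * depth[..., None]
    return pts_cam @ cam_to_world[:3, :3].T + cam_to_world[:3, 3]

depth = np.full((4, 4), 2.0)
K = np.array([[50.0, 0, 2], [0, 50.0, 2], [0, 0, 1]])
print(backproject(depth, K, np.eye(4)).shape)
</code></pre>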
arXiv:2304.06937 [cs.CV] (https://arxiv.org/abs/2304.06937)
CAMM: Building Category-Agnostic and Animatable 3D Models from Monocular Videos
Authors: Tianshu Kuai, Akash Karthikeyan, Yash Kant, Ashkan Mirzaei, Igor Gilitschenski
Abstract: Animating an object in 3D often requires an articulated structure, e.g. a kinematic chain or skeleton of the manipulated object with proper skinning weights, to obtain smooth movements and surface deformations. However, existing models that allow direct pose manipulations are either limited to specific object categories or built with specialized equipment. To reduce the work needed for creating animatable 3D models, we propose a novel reconstruction method that learns an animatable kinematic chain for any articulated object. Our method operates on monocular videos without prior knowledge of the object's shape or underlying structure. Our approach is on par with state-of-the-art 3D surface reconstruction methods on various articulated object categories while enabling direct pose manipulations by re-posing the learned kinematic chain.
Submitted 14 April, 2023; originally announced April 2023.
Comments: Project Page: https://camm3d.github.io/

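Since the method re-poses a learned kinematic chain, a small forward-kinematics sketch may help make the idea concrete. This is generic textbook forward kinematics, not CAMM's implementation; the joint layout and the `forward_kinematics` helper are hypothetical.

```python
# Illustrative sketch (not CAMM itself): forward kinematics on a simple kinematic chain.
# Re-posing amounts to recomposing each joint's world transform from per-joint rotations.
import numpy as np

def rot_z(theta):
    c, s = np.cos(theta), np.sin(theta)
    return np.array([[c, -s, 0], [s, c, 0], [0, 0, 1]])

def forward_kinematics(parents, offsets, angles):
    """parents[i]: index of joint i's parent (-1 for root); offsets[i]: (3,) bone offset
    in the parent's frame; angles[i]: rotation of joint i about z. Returns world joint positions."""
    n = len(parents)
    world_R, world_t = [None] * n, [None] * n
    for i in range(n):                       # assumes parents precede their children
        R_local, t_local = rot_z(angles[i]), offsets[i]
        if parents[i] == -1:
            world_R[i], world_t[i] = R_local, t_local
        else:
            p = parents[i]
            world_R[i] = world_R[p] @ R_local
            world_t[i] = world_R[p] @ t_local + world_t[p]
    return np.array(world_t)

# Toy 3-joint chain bent by 30 degrees at each non-root joint.
parents = [-1, 0, 1]
offsets = np.array([[0.0, 0, 0], [1.0, 0, 0], [1.0, 0, 0]])
print(forward_kinematics(parents, offsets, np.radians([0, 30, 30])))
```
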
arXiv:2302.09227 [cs.CV, cs.GR] (https://arxiv.org/abs/2302.09227)
Invertible Neural Skinning
Authors: Yash Kant, Aliaksandr Siarohin, Riza Alp Guler, Menglei Chai, Jian Ren, Sergey Tulyakov, Igor Gilitschenski
Abstract: Building animatable and editable models of clothed humans from raw 3D scans and poses is a challenging problem. Existing reposing methods suffer from the limited expressiveness of Linear Blend Skinning (LBS), require costly mesh extraction to generate each new pose, and typically do not preserve surface correspondences across different poses. In this work, we introduce Invertible Neural Skinning (INS) to address these shortcomings. To maintain correspondences, we propose a Pose-conditioned Invertible Network (PIN) architecture, which extends the LBS process by learning additional pose-varying deformations. Next, we combine PIN with a differentiable LBS module to build an expressive and end-to-end Invertible Neural Skinning (INS) pipeline. We demonstrate the strong performance of our method by outperforming the state-of-the-art reposing techniques on clothed humans and preserving surface correspondences, while being an order of magnitude faster. We also perform an ablation study, which shows the usefulness of our pose-conditioning formulation, and our qualitative results show that INS can rectify artefacts introduced by LBS. See our webpage for more details: https://yashkant.github.io/invertible-neural-skinning/
Submitted 4 March, 2023; v1 submitted 17 February, 2023; originally announced February 2023.

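INS extends Linear Blend Skinning with learned pose-varying deformations. For context, here is a minimal numpy sketch of plain LBS, the baseline being extended; it is not the INS pipeline, and the toy weights and bone transforms are invented for illustration.

```python
# Illustrative sketch (not INS): plain Linear Blend Skinning, the baseline that
# Invertible Neural Skinning extends with learned pose-varying deformations.
import numpy as np

def linear_blend_skinning(verts, weights, bone_R, bone_t):
    """verts: (V, 3) rest-pose vertices; weights: (V, B) skinning weights summing to 1;
    bone_R: (B, 3, 3) bone rotations; bone_t: (B, 3) bone translations."""
    # Transform every vertex by every bone: (B, V, 3)
    per_bone = np.einsum('bij,vj->bvi', bone_R, verts) + bone_t[:, None, :]
    # Blend the candidate positions with the per-vertex weights: (V, 3)
    return np.einsum('vb,bvi->vi', weights, per_bone)

rng = np.random.default_rng(0)
verts = rng.normal(size=(5, 3))
weights = np.array([[1.0, 0.0]] * 3 + [[0.5, 0.5]] * 2)   # toy weights for 2 bones
bone_R = np.stack([np.eye(3), np.eye(3)])
bone_t = np.array([[0.0, 0, 0], [0.0, 1.0, 0]])
print(linear_blend_skinning(verts, weights, bone_R, bone_t))
```
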
arXiv:2301.06866 [cs.CV] (https://arxiv.org/abs/2301.06866)
Building Scalable Video Understanding Benchmarks through Sports
Authors: Aniket Agarwal, Alex Zhang, Karthik Narasimhan, Igor Gilitschenski, Vishvak Murahari, Yash Kant
Abstract: Existing benchmarks for evaluating long video understanding fall short on two critical aspects, either lacking in scale or quality of annotations. These limitations arise from the difficulty in collecting dense annotations for long videos, which often require manually labeling each frame. In this work, we introduce an automated Annotation and Video Stream Alignment Pipeline (abbreviated ASAP). We demonstrate the generality of ASAP by aligning unlabeled videos of four different sports with corresponding freely available dense web annotations (i.e. commentary). We then leverage ASAP's scalability to create LCric, a large-scale long video understanding benchmark, with over 1000 hours of densely annotated long Cricket videos (with an average sample length of ~50 mins) collected at virtually zero annotation cost. We benchmark and analyze state-of-the-art video understanding models on LCric through a large set of compositional multi-choice and regression queries. We establish a human baseline that indicates significant room for new research to explore. Our human studies indicate that ASAP can align videos and annotations with high fidelity, precision, and speed. The dataset along with the code for ASAP and baselines can be accessed here: https://asap-benchmark.github.io/.
Submitted 26 March, 2023; v1 submitted 17 January, 2023; originally announced January 2023.

arXiv:2211.16991 [cs.CV] (https://arxiv.org/abs/2211.16991)
SparsePose: Sparse-View Camera Pose Regression and Refinement
Authors: Samarth Sinha, Jason Y. Zhang, Andrea Tagliasacchi, Igor Gilitschenski, David B. Lindell
Abstract: Camera pose estimation is a key step in standard 3D reconstruction pipelines that operate on a dense set of images of a single object or scene. However, methods for pose estimation often fail when only a few images are available because they rely on the ability to robustly identify and match visual features between image pairs. While these methods can work robustly with dense camera views, capturing a large set of images can be time-consuming or impractical. We propose SparsePose for recovering accurate camera poses given a sparse set of wide-baseline images (fewer than 10). The method learns to regress initial camera poses and then iteratively refine them after training on a large-scale dataset of objects (Co3D: Common Objects in 3D). SparsePose significantly outperforms conventional and learning-based baselines in recovering accurate camera rotations and translations. We also demonstrate our pipeline for high-fidelity 3D reconstruction using only 5-9 images of an object.
Submitted 29 November, 2022; originally announced November 2022.

arXiv:2211.12254 [cs.CV] (https://arxiv.org/abs/2211.12254)
SPIn-NeRF: Multiview Segmentation and Perceptual Inpainting with Neural Radiance Fields
Authors: Ashkan Mirzaei, Tristan Aumentado-Armstrong, Konstantinos G. Derpanis, Jonathan Kelly, Marcus A. Brubaker, Igor Gilitschenski, Alex Levinshtein
Abstract: Neural Radiance Fields (NeRFs) have emerged as a popular approach for novel view synthesis. While NeRFs are quickly being adapted for a wider set of applications, intuitively editing NeRF scenes is still an open challenge. One important editing task is the removal of unwanted objects from a 3D scene, such that the replaced region is visually plausible and consistent with its context. We refer to this task as 3D inpainting. In 3D, solutions must be both consistent across multiple views and geometrically valid. In this paper, we propose a novel 3D inpainting method that addresses these challenges. Given a small set of posed images and sparse annotations in a single input image, our framework first rapidly obtains a 3D segmentation mask for a target object. Using the mask, a perceptual optimization-based approach is then introduced that leverages learned 2D image inpainters, distilling their information into 3D space, while ensuring view consistency. We also address the lack of a diverse benchmark for evaluating 3D scene inpainting methods by introducing a dataset comprised of challenging real-world scenes. In particular, our dataset contains views of the same scene with and without a target object, enabling more principled benchmarking of the 3D inpainting task. We first demonstrate the superiority of our approach on multiview segmentation, comparing to NeRF-based methods and 2D segmentation approaches. We then evaluate on the task of 3D inpainting, establishing state-of-the-art performance against other NeRF manipulation algorithms, as well as a strong 2D image inpainter baseline. Project Page: https://spinnerf3d.github.io
Submitted 15 March, 2023; v1 submitted 22 November, 2022; originally announced November 2022.
Comments: Project Page: https://spinnerf3d.github.io
Journal ref: The IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) 2023

arXiv:2210.12566 [cs.LG, cs.AI, cs.RO] (https://arxiv.org/abs/2210.12566)
Solving Continuous Control via Q-learning
Authors: Tim Seyde, Peter Werner, Wilko Schwarting, Igor Gilitschenski, Martin Riedmiller, Daniela Rus, Markus Wulfmeier
Abstract: While there has been substantial success for solving continuous control with actor-critic methods, simpler critic-only methods such as Q-learning find limited application in the associated high-dimensional action spaces. However, most actor-critic methods come at the cost of added complexity: heuristics for stabilisation, compute requirements and wider hyperparameter search spaces. We show that a simple modification of deep Q-learning largely alleviates these issues. By combining bang-bang action discretization with value decomposition, framing single-agent control as cooperative multi-agent reinforcement learning (MARL), this simple critic-only approach matches performance of state-of-the-art continuous actor-critic methods when learning from features or pixels. We extend classical bandit examples from cooperative MARL to provide intuition for how decoupled critics leverage state information to coordinate joint optimization, and demonstrate surprisingly strong performance across a variety of continuous control tasks.
Submitted 25 September, 2023; v1 submitted 22 October, 2022; originally announced October 2022.

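The abstract combines bang-bang action discretization with value decomposition across action dimensions. The sketch below illustrates just the greedy action-selection step under those two ideas, with a hypothetical table of per-dimension Q-values standing in for a critic network; it is not the authors' agent.

```python
# Illustrative sketch (not the paper's agent): greedy action selection with bang-bang
# discretization and a decoupled, per-dimension value decomposition.
import numpy as np

BINS = np.array([-1.0, 1.0])               # bang-bang: only the action extremes

def greedy_action(q_per_dim):
    """q_per_dim: (D, 2) estimated Q-values for each action dimension and bin.
    Each dimension picks its own best bin; the joint value is taken as the mean."""
    idx = q_per_dim.argmax(axis=1)          # independent argmax per dimension
    action = BINS[idx]                      # joint bang-bang action
    joint_value = q_per_dim.max(axis=1).mean()
    return action, joint_value

# Toy example with a hypothetical 4-dimensional action space.
rng = np.random.default_rng(0)
q = rng.normal(size=(4, 2))                 # in practice these come from a critic network
a, v = greedy_action(q)
print(a, round(v, 3))
```
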
arXiv:2210.03825 [cs.AI, cs.RO] (https://arxiv.org/abs/2210.03825)
See, Plan, Predict: Language-guided Cognitive Planning with Video Prediction
Authors: Maria Attarian, Advaya Gupta, Ziyi Zhou, Wei Yu, Igor Gilitschenski, Animesh Garg
Abstract: Cognitive planning is the structural decomposition of complex tasks into a sequence of future behaviors. In the computational setting, performing cognitive planning entails grounding plans and concepts in one or more modalities in order to leverage them for low level control. Since real-world tasks are often described in natural language, we devise a cognitive planning algorithm via language-guided video prediction. Current video prediction models do not support conditioning on natural language instructions. Therefore, we propose a new video prediction architecture which leverages the power of pre-trained transformers. The network is endowed with the ability to ground concepts based on natural language input with generalization to unseen objects. We demonstrate the effectiveness of this approach on a new simulation dataset, where each task is defined by a high-level action described in natural language. Our experiments compare our method against one video generation baseline without planning or action grounding and showcase significant improvements. Our ablation studies highlight the improved generalization to unseen objects that natural language embeddings offer for concept grounding, as well as the importance of planning towards visual "imagination" of a task.
Submitted 7 October, 2022; originally announced October 2022.

arXiv:2207.01583 [cs.CV] (https://arxiv.org/abs/2207.01583)
LaTeRF: Label and Text Driven Object Radiance Fields
Authors: Ashkan Mirzaei, Yash Kant, Jonathan Kelly, Igor Gilitschenski
Abstract: Obtaining 3D object representations is important for creating photo-realistic simulations and for collecting AR and VR assets. Neural fields have shown their effectiveness in learning a continuous volumetric representation of a scene from 2D images, but acquiring object representations from these models with weak supervision remains an open challenge. In this paper we introduce LaTeRF, a method for extracting an object of interest from a scene given 2D images of the entire scene, known camera poses, a natural language description of the object, and a set of point-labels of object and non-object points in the input images. To faithfully extract the object from the scene, LaTeRF extends the NeRF formulation with an additional 'objectness' probability at each 3D point. Additionally, we leverage the rich latent space of a pre-trained CLIP model combined with our differentiable object renderer, to inpaint the occluded parts of the object. We demonstrate high-fidelity object extraction on both synthetic and real-world datasets and justify our design choices through an extensive ablation study.
Submitted 18 July, 2022; v1 submitted 4 July, 2022; originally announced July 2022.
Journal ref: European Conference on Computer Vision (ECCV) 2022

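LaTeRF is described as adding an 'objectness' probability at each 3D point to the NeRF formulation. The following sketch shows one plausible way such a probability could modulate standard volume-rendering compositing along a ray; it is illustrative only and not the paper's renderer, and the sampled densities, colors, and `composite` helper are made up.

```python
# Illustrative sketch (not LaTeRF's renderer): NeRF-style compositing along one ray where
# each sample's density is additionally weighted by an 'objectness' probability, so that
# rendering with the weights keeps only the object of interest.
import numpy as np

def composite(sigmas, colors, objectness, deltas):
    """sigmas: (N,) densities; colors: (N, 3); objectness: (N,) in [0, 1]; deltas: (N,) step sizes."""
    eff_sigma = sigmas * objectness                        # suppress non-object samples
    alpha = 1.0 - np.exp(-eff_sigma * deltas)              # per-sample opacity
    trans = np.cumprod(np.concatenate([[1.0], 1.0 - alpha[:-1]]))  # accumulated transmittance
    weights = alpha * trans
    return (weights[:, None] * colors).sum(axis=0)         # rendered RGB for this ray

# Toy ray with 4 samples; the last two are marked as non-object.
sigmas = np.array([2.0, 3.0, 5.0, 5.0])
colors = np.array([[1, 0, 0], [1, 0, 0], [0, 0, 1], [0, 0, 1]], dtype=float)
objectness = np.array([1.0, 1.0, 0.05, 0.05])
print(composite(sigmas, colors, objectness, deltas=np.full(4, 0.1)))
```
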
arXiv:2205.10712 [cs.CV] (https://arxiv.org/abs/2205.10712)
Housekeep: Tidying Virtual Households using Commonsense Reasoning
Authors: Yash Kant, Arun Ramachandran, Sriram Yenamandra, Igor Gilitschenski, Dhruv Batra, Andrew Szot, Harsh Agrawal
Abstract: We introduce Housekeep, a benchmark to evaluate commonsense reasoning in the home for embodied AI. In Housekeep, an embodied agent must tidy a house by rearranging misplaced objects without explicit instructions specifying which objects need to be rearranged. Instead, the agent must learn from and is evaluated against human preferences of which objects belong where in a tidy house. Specifically, we collect a dataset of where humans typically place objects in tidy and untidy houses constituting 1799 objects, 268 object categories, 585 placements, and 105 rooms. Next, we propose a modular baseline approach for Housekeep that integrates planning, exploration, and navigation. It leverages a fine-tuned large language model (LLM) trained on an internet text corpus for effective planning. We show that our baseline agent generalizes to rearranging unseen objects in unknown environments. See our webpage for more details: https://yashkant.github.io/housekeep/
Submitted 21 May, 2022; originally announced May 2022.

arXiv:2205.09117 [cs.LG, cs.RO, eess.SY] (https://arxiv.org/abs/2205.09117)
Neighborhood Mixup Experience Replay: Local Convex Interpolation for Improved Sample Efficiency in Continuous Control Tasks
Authors: Ryan Sander, Wilko Schwarting, Tim Seyde, Igor Gilitschenski, Sertac Karaman, Daniela Rus
Abstract: Experience replay plays a crucial role in improving the sample efficiency of deep reinforcement learning agents. Recent advances in experience replay propose using Mixup (Zhang et al., 2018) to further improve sample efficiency via synthetic sample generation. We build upon this technique with Neighborhood Mixup Experience Replay (NMER), a geometrically-grounded replay buffer that interpolates transitions with their closest neighbors in state-action space. NMER preserves a locally linear approximation of the transition manifold by only applying Mixup between transitions with vicinal state-action features. Under NMER, a given transition's set of state-action neighbors is dynamic and episode agnostic, in turn encouraging greater policy generalizability via inter-episode interpolation. We combine our approach with recent off-policy deep reinforcement learning algorithms and evaluate on continuous control environments. We observe that NMER improves sample efficiency by an average 94% (TD3) and 29% (SAC) over baseline replay buffers, enabling agents to effectively recombine previous experiences and learn from limited data.
Submitted 17 May, 2022; originally announced May 2022.
Comments: Accepted to L4DC 2022

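NMER interpolates a sampled transition with its nearest neighbor in state-action space using Mixup. A minimal numpy sketch of that sampling step follows, assuming a flat array-based buffer; the `nmer_sample` helper and the Beta(0.4, 0.4) prior are illustrative choices, not the released implementation.

```python
# Illustrative sketch (not the NMER implementation): interpolate a sampled transition with
# its nearest neighbor in state-action space using a Mixup coefficient drawn from a Beta prior.
import numpy as np

def nmer_sample(states, actions, rewards, next_states, idx, alpha=0.4, rng=None):
    rng = rng or np.random.default_rng()
    sa = np.concatenate([states, actions], axis=1)         # state-action features
    dists = np.linalg.norm(sa - sa[idx], axis=1)
    dists[idx] = np.inf                                    # exclude the transition itself
    j = int(dists.argmin())                                # nearest neighbor in the buffer
    lam = rng.beta(alpha, alpha)                           # Mixup interpolation coefficient
    mix = lambda a, b: lam * a + (1.0 - lam) * b
    return (mix(states[idx], states[j]), mix(actions[idx], actions[j]),
            mix(rewards[idx], rewards[j]), mix(next_states[idx], next_states[j]))

# Toy buffer of 100 transitions with 4-D states and 2-D actions.
rng = np.random.default_rng(0)
S, A = rng.normal(size=(100, 4)), rng.normal(size=(100, 2))
R, S2 = rng.normal(size=(100, 1)), rng.normal(size=(100, 4))
print([x.shape for x in nmer_sample(S, A, R, S2, idx=7, rng=rng)])
```
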
arXiv:2111.12137 [cs.RO, cs.CV, cs.LG] (https://arxiv.org/abs/2111.12137)
Learning Interactive Driving Policies via Data-driven Simulation
Authors: Tsun-Hsuan Wang, Alexander Amini, Wilko Schwarting, Igor Gilitschenski, Sertac Karaman, Daniela Rus
Abstract: Data-driven simulators promise high data-efficiency for driving policy learning. When used for modelling interactions, this data-efficiency becomes a bottleneck: Small underlying datasets often lack interesting and challenging edge cases for learning interactive driving. We address this challenge by proposing a simulation method that uses in-painted ado vehicles for learning robust driving policies. Thus, our approach can be used to learn policies that involve multi-agent interactions and allows for training via state-of-the-art policy learning methods. We evaluate the approach for learning standard interaction scenarios in driving. In extensive experiments, our work demonstrates that the resulting policies can be directly transferred to a full-scale autonomous vehicle without making use of any traditional sim-to-real transfer techniques such as domain randomization.
Submitted 23 November, 2021; originally announced November 2021.
Comments: The first two authors contributed equally to this work. Code is available here: http://vista.csail.mit.edu/

arXiv:2111.12083 [cs.RO, cs.CV, cs.LG] (https://arxiv.org/abs/2111.12083)
VISTA 2.0: An Open, Data-driven Simulator for Multimodal Sensing and Policy Learning for Autonomous Vehicles
Authors: Alexander Amini, Tsun-Hsuan Wang, Igor Gilitschenski, Wilko Schwarting, Zhijian Liu, Song Han, Sertac Karaman, Daniela Rus
Abstract: Simulation has the potential to transform the development of robust algorithms for mobile agents deployed in safety-critical scenarios. However, the poor photorealism and lack of diverse sensor modalities of existing simulation engines remain key hurdles towards realizing this potential. Here, we present VISTA, an open source, data-driven simulator that integrates multiple types of sensors for autonomous vehicles. Using high fidelity, real-world datasets, VISTA represents and simulates RGB cameras, 3D LiDAR, and event-based cameras, enabling the rapid generation of novel viewpoints in simulation and thereby enriching the data available for policy learning with corner cases that are difficult to capture in the physical world. Using VISTA, we demonstrate the ability to train and test perception-to-control policies across each of the sensor types and showcase the power of this approach via deployment on a full scale autonomous vehicle. The policies learned in VISTA exhibit sim-to-real transfer without modification and greater robustness than those trained exclusively on real-world data.
Submitted 23 November, 2021; originally announced November 2021.
Comments: First two authors contributed equally. Code and project website are available here: https://vista.csail.mit.edu

arXiv:2111.02552 [cs.LG, cs.AI, cs.RO] (https://arxiv.org/abs/2111.02552)
Is Bang-Bang Control All You Need? Solving Continuous Control with Bernoulli Policies
Authors: Tim Seyde, Igor Gilitschenski, Wilko Schwarting, Bartolomeo Stellato, Martin Riedmiller, Markus Wulfmeier, Daniela Rus
Abstract: Reinforcement learning (RL) for continuous control typically employs distributions whose support covers the entire action space. In this work, we investigate the colloquially known phenomenon that trained agents often prefer actions at the boundaries of that space. We draw theoretical connections to the emergence of bang-bang behavior in optimal control, and provide extensive empirical evaluation across a variety of recent RL algorithms. We replace the normal Gaussian by a Bernoulli distribution that solely considers the extremes along each action dimension - a bang-bang controller. Surprisingly, this achieves state-of-the-art performance on several continuous control benchmarks - in contrast to robotic hardware, where energy and maintenance cost affect controller choices. Since exploration, learning, and the final solution are entangled in RL, we provide additional imitation learning experiments to reduce the impact of exploration on our analysis. Finally, we show that our observations generalize to environments that aim to model real-world challenges and evaluate factors to mitigate the emergence of bang-bang solutions. Our findings emphasize challenges for benchmarking continuous control algorithms, particularly in light of potential real-world applications.
Submitted 3 November, 2021; originally announced November 2021.

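The paper replaces a Gaussian policy with a Bernoulli distribution over the extremes of each action dimension. Below is a small sketch of sampling from such a bang-bang policy, with hypothetical logits standing in for a policy network's output; it is not the authors' code.

```python
# Illustrative sketch (not the paper's agent): a per-dimension Bernoulli policy whose two
# outcomes are mapped to the action-space extremes, i.e. a stochastic bang-bang controller.
import numpy as np

def sample_bang_bang(logits, low, high, rng):
    """logits: (D,) Bernoulli logits per action dimension; low/high: (D,) action bounds."""
    p_high = 1.0 / (1.0 + np.exp(-logits))           # probability of choosing the upper extreme
    pick_high = rng.random(logits.shape) < p_high
    return np.where(pick_high, high, low)

rng = np.random.default_rng(0)
logits = np.array([2.0, -1.0, 0.0])                  # would come from a policy network in practice
low, high = np.array([-1.0, -1.0, -1.0]), np.array([1.0, 1.0, 1.0])
print(sample_bang_bang(logits, low, high, rng))
```
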
arXiv:2110.02344 [cs.RO, cs.AI, cs.LG] (https://arxiv.org/abs/2110.02344)
HYPER: Learned Hybrid Trajectory Prediction via Factored Inference and Adaptive Sampling
Authors: Xin Huang, Guy Rosman, Igor Gilitschenski, Ashkan Jasour, Stephen G. McGill, John J. Leonard, Brian C. Williams
Abstract: Modeling multi-modal high-level intent is important for ensuring diversity in trajectory prediction. Existing approaches explore the discrete nature of human intent before predicting continuous trajectories, to improve accuracy and support explainability. However, these approaches often assume the intent to remain fixed over the prediction horizon, which is problematic in practice, especially over longer horizons. To overcome this limitation, we introduce HYPER, a general and expressive hybrid prediction framework that models evolving human intent. By modeling traffic agents as a hybrid discrete-continuous system, our approach is capable of predicting discrete intent changes over time. We learn the probabilistic hybrid model via a maximum likelihood estimation problem and leverage neural proposal distributions to sample adaptively from the exponentially growing discrete space. The overall approach affords a better trade-off between accuracy and coverage. We train and validate our model on the Argoverse dataset, and demonstrate its effectiveness through comprehensive ablation studies and comparisons with state-of-the-art models.
Submitted 5 October, 2021; originally announced October 2021.
Comments: 12 pages, 10 figures, 4 tables

arXiv:2102.09812 [cs.LG, cs.AI, cs.RO] (https://arxiv.org/abs/2102.09812)
Deep Latent Competition: Learning to Race Using Visual Control Policies in Latent Space
Authors: Wilko Schwarting, Tim Seyde, Igor Gilitschenski, Lucas Liebenwein, Ryan Sander, Sertac Karaman, Daniela Rus
Abstract: Learning competitive behaviors in multi-agent settings such as racing requires long-term reasoning about potential adversarial interactions. This paper presents Deep Latent Competition (DLC), a novel reinforcement learning algorithm that learns competitive visual control policies through self-play in imagination. The DLC agent imagines multi-agent interaction sequences in the compact latent space of a learned world model that combines a joint transition function with opponent viewpoint prediction. Imagined self-play reduces costly sample generation in the real world, while the latent representation enables planning to scale gracefully with observation dimensionality. We demonstrate the effectiveness of our algorithm in learning competitive behaviors on a novel multi-agent racing benchmark that requires planning from image observations. Code and videos available at https://sites.google.com/view/deep-latent-competition.
Submitted 19 February, 2021; originally announced February 2021.
Comments: Wilko, Tim, and Igor contributed equally to this work; published in Conference on Robot Learning 2020

<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2102.09812v1-abstract-full').style.display = 'none'; document.getElementById('2102.09812v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 February, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Wilko, Tim, and Igor contributed equally to this work; published in Conference on Robot Learning 2020</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1912.06785">arXiv:1912.06785</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1912.06785">pdf</a>, <a href="https://arxiv.org/format/1912.06785">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/LRA.2020.3004800">10.1109/LRA.2020.3004800 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Deep Context Maps: Agent Trajectory Prediction using Location-specific Latent Maps </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gilitschenski%2C+I">Igor Gilitschenski</a>, <a href="/search/cs?searchtype=author&amp;query=Rosman%2C+G">Guy Rosman</a>, <a href="/search/cs?searchtype=author&amp;query=Gupta%2C+A">Arjun Gupta</a>, <a href="/search/cs?searchtype=author&amp;query=Karaman%2C+S">Sertac Karaman</a>, <a href="/search/cs?searchtype=author&amp;query=Rus%2C+D">Daniela Rus</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1912.06785v2-abstract-short" style="display: inline;"> In this paper, we propose a novel approach for agent motion prediction in cluttered environments. One of the main challenges in predicting agent motion is accounting for location and context-specific information. Our main contribution is the concept of learning context maps to improve the prediction task. Context maps are a set of location-specific latent maps that are trained alongside the predic&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1912.06785v2-abstract-full').style.display = 'inline'; document.getElementById('1912.06785v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1912.06785v2-abstract-full" style="display: none;"> In this paper, we propose a novel approach for agent motion prediction in cluttered environments. One of the main challenges in predicting agent motion is accounting for location and context-specific information. 

arXiv:1912.06785 [pdf, other]  cs.RO cs.CV cs.LG  doi:10.1109/LRA.2020.3004800 (https://doi.org/10.1109/LRA.2020.3004800)
Deep Context Maps: Agent Trajectory Prediction using Location-specific Latent Maps
Authors: Igor Gilitschenski, Guy Rosman, Arjun Gupta, Sertac Karaman, Daniela Rus
Abstract: In this paper, we propose a novel approach for agent motion prediction in cluttered environments. One of the main challenges in predicting agent motion is accounting for location and context-specific information. Our main contribution is the concept of learning context maps to improve the prediction task. Context maps are a set of location-specific latent maps that are trained alongside the predictor. Thus, the proposed maps are capable of capturing location context beyond visual context cues (e.g. usual average speeds and typical trajectories) or predefined map primitives (such as lanes and stop lines). We pose context map learning as a multi-task training problem and describe our map model and its incorporation into a state-of-the-art trajectory predictor. In extensive experiments, it is shown that the use of learned maps can significantly improve predictor accuracy. Furthermore, the performance can be additionally boosted by providing partial knowledge of map semantics.
Submitted 19 June, 2020; v1 submitted 14 December, 2019; originally announced December 2019.
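
A minimal sketch of the "location-specific latent map" idea, assuming a simple grid parameterization and nearest-cell lookup (the paper's exact map model and predictor are not reproduced here): the grid is an ordinary trainable parameter, so it receives gradients from the prediction loss like any other layer.

    import torch
    import torch.nn as nn

    class ContextMap(nn.Module):
        # A trainable grid of latent features, indexed by (x, y) position and
        # trained jointly with the trajectory predictor. Grid size, channel
        # count, and nearest-cell lookup are assumptions of this sketch.
        def __init__(self, height=64, width=64, channels=8, cell_size=1.0):
            super().__init__()
            self.grid = nn.Parameter(0.01 * torch.randn(channels, height, width))
            self.cell_size = cell_size

        def forward(self, xy):                  # xy: (N, 2) positions in map frame
            col = (xy[:, 0] / self.cell_size).long().clamp(0, self.grid.shape[2] - 1)
            row = (xy[:, 1] / self.cell_size).long().clamp(0, self.grid.shape[1] - 1)
            return self.grid[:, row, col].t()   # (N, channels) local latent context

    ctx = ContextMap()
    features = ctx(torch.tensor([[3.2, 10.7], [40.0, 5.5]]))
    print(features.shape)                       # torch.Size([2, 8])
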

arXiv:1910.05422 [pdf, other]  cs.LG cs.DS stat.ML
SiPPing Neural Networks: Sensitivity-informed Provable Pruning of Neural Networks
Authors: Cenk Baykal, Lucas Liebenwein, Igor Gilitschenski, Dan Feldman, Daniela Rus
Abstract: We introduce a pruning algorithm that provably sparsifies the parameters of a trained model in a way that approximately preserves the model's predictive accuracy. Our algorithm uses a small batch of input points to construct a data-informed importance sampling distribution over the network's parameters, and adaptively mixes a sampling-based and deterministic pruning procedure to discard redundant weights. Our pruning method is simultaneously computationally efficient, provably accurate, and broadly applicable to various network architectures and data distributions. Our empirical comparisons show that our algorithm reliably generates highly compressed networks that incur minimal loss in performance relative to that of the original network. We present experimental results that demonstrate our algorithm's potential to unearth essential network connections that can be trained successfully in isolation, which may be of independent interest.
Submitted 14 March, 2021; v1 submitted 11 October, 2019; originally announced October 2019.
Comments: First two authors contributed equally
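
The recipe in this abstract (data-informed importance scores plus a mix of deterministic and sampling-based pruning) can be sketched for a single linear layer as follows. The sensitivity proxy, the split between deterministic and sampled budget, and all parameter names are assumptions made for the example, not the paper's exact procedure.

    import numpy as np

    rng = np.random.default_rng(2)

    def sipp_prune_layer(W, X, keep_frac=0.3, det_frac=0.5):
        # W: (out, in) weights of one linear layer, X: (batch, in) small input batch.
        # Per-weight importance: largest contribution |w_ij * x_j| seen in the batch
        # (a stand-in for the paper's sensitivity). The top portion of the budget is
        # kept deterministically; the rest is sampled proportionally to importance.
        contrib = np.abs(W)[None, :, :] * np.abs(X)[:, None, :]   # (batch, out, in)
        sens = contrib.max(axis=0)
        budget = int(keep_frac * W.size)
        n_det = int(det_frac * budget)

        order = np.argsort(sens, axis=None)[::-1]
        keep = set(order[:n_det].tolist())                        # deterministic part

        rest = order[n_det:]
        p = sens.flat[rest] / sens.flat[rest].sum()
        keep.update(rng.choice(rest, size=budget - n_det, replace=False, p=p).tolist())

        mask = np.zeros(W.size, dtype=bool)
        mask[list(keep)] = True
        return W * mask.reshape(W.shape)

    W = rng.normal(size=(16, 32))
    X = rng.normal(size=(8, 32))
    print(np.count_nonzero(sipp_prune_layer(W, X)), "of", W.size, "weights kept")
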

arXiv:1808.02658 [pdf, other]  cs.RO
Map Management for Efficient Long-Term Visual Localization in Outdoor Environments
Authors: Mathias Bürki, Marcin Dymczyk, Igor Gilitschenski, Cesar Cadena, Roland Siegwart, Juan Nieto
Abstract: We present a complete map management process for a visual localization system designed for multi-vehicle long-term operations in resource-constrained outdoor environments. Outdoor visual localization generates large amounts of data that need to be incorporated into a lifelong visual map in order to allow localization at all times and under all appearance conditions. Processing these large quantities of data is non-trivial, as it is subject to limited computational and storage capabilities both on the vehicle and on the mapping backend. We address this problem with a two-fold map update paradigm capable of either adding new visual cues to the map, or updating co-observation statistics. The former, in combination with offline map summarization techniques, allows enhancing the appearance coverage of the lifelong map while keeping the map size limited. On the other hand, the latter is able to significantly boost the appearance-based landmark selection for efficient online localization without incurring any additional computational or storage burden. Our evaluation in challenging outdoor conditions shows that our proposed map management process allows building and maintaining maps for precise visual localization over long time spans in a tractable and scalable fashion.
Submitted 8 August, 2018; originally announced August 2018.
Comments: 7 pages
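
The two-fold update paradigm reduces, at its core, to two very different write operations on the map. The toy class below (data layout assumed) is only meant to make that split concrete: one method grows the map with new visual cues, the other touches nothing but co-observation counters.

    class LifelongMap:
        # Minimal sketch of the two-fold update paradigm; the real system stores
        # full landmark geometry/descriptors and runs offline summarization.
        def __init__(self):
            self.landmarks = {}    # landmark_id -> descriptor / geometry payload
            self.coobs = {}        # (id_a, id_b) -> co-observation count

        def add_landmarks(self, new_landmarks):
            # Mode 1: enrich appearance coverage (followed, in the real system,
            # by map summarization to keep the map size bounded).
            self.landmarks.update(new_landmarks)

        def update_coobservations(self, observed_ids):
            # Mode 2: cheap statistics-only update, no new data stored.
            observed_ids = sorted(observed_ids)
            for i, a in enumerate(observed_ids):
                for b in observed_ids[i + 1:]:
                    self.coobs[(a, b)] = self.coobs.get((a, b), 0) + 1

    m = LifelongMap()
    m.add_landmarks({1: "desc1", 2: "desc2", 3: "desc3"})
    m.update_coobservations([1, 3])
    print(m.coobs)
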

arXiv:1808.02656 [pdf, other]  cs.RO
Appearance-Based Landmark Selection for Efficient Long-Term Visual Localization
Authors: Mathias Bürki, Igor Gilitschenski, Elena Stumm, Roland Siegwart, Juan Nieto
Abstract: We present an online landmark selection method for distributed long-term visual localization systems in bandwidth-constrained environments. Sharing a common map for online localization provides a fleet of autonomous vehicles with the possibility to maintain and access a consistent map source, and therefore reduce redundancy while increasing efficiency. However, connectivity over a mobile network imposes strict bandwidth constraints and thus the need to minimize the amount of exchanged data. The wide range of varying appearance conditions encountered during long-term visual localization offers the potential to reduce data usage by extracting only those visual cues which are relevant at the given time. Motivated by this, we propose an unsupervised method of adaptively selecting landmarks according to how likely these landmarks are to be observable under the prevailing appearance condition. The ranking function this selection is based upon exploits landmark co-observability statistics collected in past traversals through the mapped area. Evaluation is performed over different outdoor environments, large time-scales, and varying appearance conditions, including the extreme transition from day-time to night-time. We demonstrate that with our appearance-dependent selection method, we can significantly reduce the number of landmarks used for localization while maintaining or even improving the localization performance.
Submitted 8 August, 2018; originally announced August 2018.
Comments: 7 pages
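
A small sketch of the co-observability-based ranking described above, under the assumption that the statistics are kept as a symmetric count matrix and that a few landmarks already matched under the current conditions act as seeds; the paper's actual ranking function may differ.

    import numpy as np

    def rank_landmarks(coobs, seed_ids, k=3):
        # Rank the remaining landmarks by how often they were co-observed with
        # the seed landmarks in past traversals, and return the top-k candidates.
        scores = coobs[:, seed_ids].sum(axis=1).astype(float)
        scores[seed_ids] = -np.inf                 # do not re-select the seeds
        return np.argsort(scores)[::-1][:k]

    # Toy symmetric co-observation matrix for 6 landmarks (assumed data):
    coobs = np.array([
        [0, 5, 0, 2, 0, 1],
        [5, 0, 1, 4, 0, 0],
        [0, 1, 0, 0, 6, 2],
        [2, 4, 0, 0, 1, 0],
        [0, 0, 6, 1, 0, 3],
        [1, 0, 2, 0, 3, 0],
    ])
    print(rank_landmarks(coobs, seed_ids=[0, 1]))   # landmark 3 ranks highest here
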

arXiv:1807.04702 [pdf, other]  cs.RO cs.CV
LandmarkBoost: Efficient Visual Context Classifiers for Robust Localization
Authors: Marcin Dymczyk, Igor Gilitschenski, Juan Nieto, Simon Lynen, Bernhard Zeisl, Roland Siegwart
Abstract: The growing popularity of autonomous systems creates a need for reliable and efficient metric pose retrieval algorithms. Currently used approaches tend to rely on nearest neighbor search of binary descriptors to perform the 2D-3D matching and guarantee realtime capabilities on mobile platforms. These methods struggle, however, with the growing size of the map, changes in viewpoint or appearance, and visual aliasing present in the environment. The rigidly defined descriptor patterns only capture a limited neighborhood of the keypoint and completely ignore the overall visual context. We propose LandmarkBoost, an approach that, in contrast to the conventional 2D-3D matching methods, casts the search problem as a landmark classification task. We use a boosted classifier to classify landmark observations and directly obtain correspondences as classifier scores. We also introduce a formulation of visual context that is flexible, efficient to compute, and can capture relationships in the entire image plane. The original binary descriptors are augmented with contextual information and informative features are selected by the boosting framework. Through detailed experiments, we evaluate the retrieval quality and performance of LandmarkBoost, demonstrating that it outperforms common state-of-the-art descriptor matching methods.
Submitted 13 July, 2018; v1 submitted 12 July, 2018; originally announced July 2018.
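
To make the "correspondences as classifier scores" idea concrete, here is a toy sketch using scikit-learn's GradientBoostingClassifier as a stand-in for the paper's boosting framework; the synthetic descriptors, context features, and labels are invented for the example.

    import numpy as np
    from sklearn.ensemble import GradientBoostingClassifier

    rng = np.random.default_rng(3)

    # Synthetic training data (assumed): each row is a keypoint descriptor
    # augmented with visual-context features; the label is the map landmark.
    n_landmarks, dim = 5, 16
    y_train = rng.integers(0, n_landmarks, size=200)
    X_train = rng.normal(scale=0.5, size=(200, dim)) + y_train[:, None]

    clf = GradientBoostingClassifier(n_estimators=50, max_depth=2)
    clf.fit(X_train, y_train)

    # At query time, the classifier scores act directly as 2D-3D correspondence scores.
    query = rng.normal(scale=0.5, size=(1, dim)) + 2.0
    scores = clf.predict_proba(query)[0]
    best = int(np.argmax(scores))
    print("best landmark:", best, "score:", round(float(scores[best]), 3))
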

arXiv:1804.09270 [pdf, other]  cs.RO
Learning 3D Segment Descriptors for Place Recognition
Authors: Andrei Cramariuc, Renaud Dubé, Hannes Sommer, Roland Siegwart, Igor Gilitschenski
Abstract: In the absence of global positioning information, place recognition is a key capability for enabling localization, mapping, and navigation in any environment. Most place recognition methods rely on images, point clouds, or a combination of both. In this work we leverage a segment extraction and matching approach to achieve place recognition in Light Detection and Ranging (LiDAR) based 3D point cloud maps. One challenge related to this approach is the recognition of segments despite changes in point of view or occlusion. We propose using a learning-based method in order to reach a higher recall accuracy than previously proposed methods. Using Convolutional Neural Networks (CNNs), which are state-of-the-art classifiers, we propose a new approach to segment recognition based on learned descriptors. In this paper we compare the effectiveness of three different structures and training methods for CNNs. We demonstrate through several experiments on real-world data collected in an urban driving scenario that the proposed learning-based methods outperform hand-crafted descriptors.
Submitted 24 April, 2018; originally announced April 2018.
Comments: Presented at IROS 2017 Workshop on Learning for Localization and Mapping

arXiv:1804.05345 [pdf, other]  cs.LG cs.DS stat.ML
Data-Dependent Coresets for Compressing Neural Networks with Applications to Generalization Bounds
Authors: Cenk Baykal, Lucas Liebenwein, Igor Gilitschenski, Dan Feldman, Daniela Rus
Abstract: We present an efficient coresets-based neural network compression algorithm that sparsifies the parameters of a trained fully-connected neural network in a manner that provably approximates the network's output. Our approach is based on an importance sampling scheme that judiciously defines a sampling distribution over the neural network parameters, and as a result, retains parameters of high importance while discarding redundant ones. We leverage a novel, empirical notion of sensitivity and extend traditional coreset constructions to the application of compressing parameters. Our theoretical analysis establishes guarantees on the size and accuracy of the resulting compressed network and gives rise to generalization bounds that may provide new insights into the generalization properties of neural networks. We demonstrate the practical effectiveness of our algorithm on a variety of neural network configurations and real-world data sets.
Submitted 17 May, 2019; v1 submitted 15 April, 2018; originally announced April 2018.
Comments: First two authors contributed equally
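
The importance-sampling scheme can be sketched for a single neuron: weights are sampled with probability proportional to an empirical sensitivity proxy and reweighted so the compressed pre-activation is unbiased in expectation. The proxy and sample size below are assumptions for the example, not the paper's construction.

    import numpy as np

    rng = np.random.default_rng(4)

    def coreset_compress_neuron(w, acts, m):
        # w: (d,) incoming weights of one neuron, acts: (batch, d) non-negative
        # input activations from a small data batch, m: number of weight samples.
        sens = np.max(np.abs(w) * acts, axis=0)     # empirical sensitivity proxy (assumed)
        p = sens / sens.sum()
        idx = rng.choice(len(w), size=m, replace=True, p=p)
        counts = np.bincount(idx, minlength=len(w))
        # Reweighting by counts / (m * p) keeps the compressed output unbiased.
        return np.where(counts > 0, w * counts / (m * p), 0.0)

    d = 64
    w = rng.normal(size=d)
    acts = np.abs(rng.normal(size=(32, d)))
    w_hat = coreset_compress_neuron(w, acts, m=20)
    x = np.abs(rng.normal(size=d))
    print("kept:", np.count_nonzero(w_hat),
          "pre-activation:", round(float(w @ x), 3), "vs", round(float(w_hat @ x), 3))
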

arXiv:1802.09043 [pdf, other]  cs.RO
Free LSD: Prior-Free Visual Landing Site Detection for Autonomous Planes
Authors: Timo Hinzmann, Thomas Stastny, Cesar Cadena, Roland Siegwart, Igor Gilitschenski
Abstract: Full autonomy for fixed-wing unmanned aerial vehicles (UAVs) requires the capability to autonomously detect potential landing sites in unknown and unstructured terrain, allowing for self-governed mission completion or handling of emergency situations. In this work, we propose a perception system addressing this challenge by detecting landing sites based on their texture and geometric shape without using any prior knowledge about the environment. The proposed method considers hazards within the landing region such as terrain roughness and slope, surrounding obstacles that obscure the landing approach path, and the local wind field that is estimated by the on-board EKF. The latter enables applicability of the proposed method on small-scale autonomous planes without landing gear. A safe approach path is computed based on the UAV dynamics, expected state estimation and actuator uncertainty, and the on-board computed elevation map. The proposed framework has been successfully tested on photo-realistic synthetic datasets and in challenging real-world environments.
Submitted 25 February, 2018; originally announced February 2018.
Comments: Accepted for publication in IEEE International Conference on Robotics and Automation (ICRA), 2018, Brisbane, and IEEE Robotics and Automation Letters (RA-L), 2018
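
The geometric hazard checks mentioned in the abstract (terrain slope and roughness from an on-board elevation map) can be sketched with numpy as below; the window size and thresholds are invented, and the texture, approach-path, and wind components of the full system are omitted.

    import numpy as np

    def hazard_mask(elevation, cell=0.5, max_slope_deg=5.0, max_roughness=0.05):
        # Keep cells whose slope and local height variation stay below thresholds.
        gy, gx = np.gradient(elevation, cell)               # metres per metre
        slope_deg = np.degrees(np.arctan(np.hypot(gx, gy)))

        # Roughness: standard deviation of height over a 3x3 neighbourhood.
        pad = np.pad(elevation, 1, mode="edge")
        windows = np.stack([pad[i:i + elevation.shape[0], j:j + elevation.shape[1]]
                            for i in range(3) for j in range(3)])
        roughness = windows.std(axis=0)

        return (slope_deg < max_slope_deg) & (roughness < max_roughness)

    elev = 0.01 * np.random.default_rng(5).normal(size=(40, 40)).cumsum(axis=0)
    print("candidate landing cells:", int(hazard_mask(elev).sum()))
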

arXiv:1801.07478 [pdf, other]  cs.RO
Why and How to Avoid the Flipped Quaternion Multiplication
Authors: Hannes Sommer, Igor Gilitschenski, Michael Bloesch, Stephan Weiss, Roland Siegwart, Juan Nieto
Abstract: Over the last decades, quaternions have become a crucial and very successful tool for attitude representation in robotics and aerospace. However, there is a major problem that is continuously causing trouble in practice when it comes to exchanging formulas or implementations: there are two quaternion multiplications in common use, Hamilton's original multiplication and its flipped version, which is often associated with NASA's Jet Propulsion Laboratory. We believe that this particular issue is completely avoidable and only exists today due to a lack of understanding. This paper explains the underlying problem for the popular passive world-to-body usage of rotation quaternions, and derives an alternative solution compatible with Hamilton's multiplication. Furthermore, it argues for entirely discontinuing the flipped multiplication. Additionally, it provides recipes for efficiently detecting relevant conventions and migrating formulas or algorithms between them.
Submitted 5 February, 2018; v1 submitted 23 January, 2018; originally announced January 2018.
Comments: 16 pages, 1 figure, 2 tables (minor improvements and fixes over v1, smaller page margins)
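
The convention clash the paper addresses is easy to state in code: the "flipped" multiplication is Hamilton's product with its operands swapped, so mixing the two silently reverses composition order. A minimal sketch, storing quaternions as (w, x, y, z):

    import numpy as np

    def hamilton_product(q, p):
        # Hamilton's quaternion product q * p, components ordered (w, x, y, z).
        w1, x1, y1, z1 = q
        w2, x2, y2, z2 = p
        return np.array([
            w1*w2 - x1*x2 - y1*y2 - z1*z2,
            w1*x2 + x1*w2 + y1*z2 - z1*y2,
            w1*y2 + y1*w2 + z1*x2 - x1*z2,
            w1*z2 + z1*w2 + x1*y2 - y1*x2,
        ])

    def flipped_product(q, p):
        # The flipped (often JPL-associated) multiplication equals Hamilton's
        # product with swapped operands, which is exactly the pitfall when the
        # two conventions are mixed.
        return hamilton_product(p, q)

    i, j = np.array([0, 1, 0, 0]), np.array([0, 0, 1, 0])
    print(hamilton_product(i, j))   # [0 0 0 1]  -> i*j = +k under Hamilton
    print(flipped_product(i, j))    # [0 0 0 -1] -> i*j = -k under the flipped rule
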
