Search | arXiv e-print repository

<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 78 results for author: <span class="mathjax">Kanazawa, A</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Kanazawa%2C+A">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Kanazawa, A"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Kanazawa%2C+A&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Kanazawa, A"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Kanazawa%2C+A&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Kanazawa%2C+A&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Kanazawa%2C+A&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05175">arXiv:2502.05175</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.05175">pdf</a>, <a href="https://arxiv.org/format/2502.05175">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> Fillerbuster: Multi-View Scene Completion for Casual Captures </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Weber%2C+E">Ethan Weber</a>, <a href="/search/cs?searchtype=author&amp;query=M%C3%BCller%2C+N">Norman M眉ller</a>, <a href="/search/cs?searchtype=author&amp;query=Kant%2C+Y">Yash Kant</a>, <a href="/search/cs?searchtype=author&amp;query=Agrawal%2C+V">Vasu Agrawal</a>, <a href="/search/cs?searchtype=author&amp;query=Zollh%C3%B6fer%2C+M">Michael Zollh枚fer</a>, <a href="/search/cs?searchtype=author&amp;query=Kanazawa%2C+A">Angjoo Kanazawa</a>, <a href="/search/cs?searchtype=author&amp;query=Richardt%2C+C">Christian Richardt</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.05175v1-abstract-short" style="display: inline;"> We present Fillerbuster, a method that completes unknown regions of a 3D scene by utilizing a novel large-scale multi-view latent diffusion transformer. Casual captures are often sparse and miss surrounding content behind objects or above the scene. 
Existing methods are not suitable for handling this challenge as they focus on making the known pixels look good with sparse-view priors, or on creati&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05175v1-abstract-full').style.display = 'inline'; document.getElementById('2502.05175v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.05175v1-abstract-full" style="display: none;"> We present Fillerbuster, a method that completes unknown regions of a 3D scene by utilizing a novel large-scale multi-view latent diffusion transformer. Casual captures are often sparse and miss surrounding content behind objects or above the scene. Existing methods are not suitable for handling this challenge as they focus on making the known pixels look good with sparse-view priors, or on creating the missing sides of objects from just one or two photos. In reality, we often have hundreds of input frames and want to complete areas that are missing and unobserved from the input frames. Additionally, the images often do not have known camera parameters. Our solution is to train a generative model that can consume a large context of input frames while generating unknown target views and recovering image poses when desired. We show results where we complete partial captures on two existing datasets. We also present an uncalibrated scene completion task where our unified model predicts both poses and creates new content. Our model is the first to predict many images and poses together for scene completion. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05175v1-abstract-full').style.display = 'none'; document.getElementById('2502.05175v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page at https://ethanweber.me/fillerbuster/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.12387">arXiv:2501.12387</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.12387">pdf</a>, <a href="https://arxiv.org/format/2501.12387">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Continuous 3D Perception Model with Persistent State </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Q">Qianqian Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yifei Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Holynski%2C+A">Aleksander Holynski</a>, <a href="/search/cs?searchtype=author&amp;query=Efros%2C+A+A">Alexei A. 
Efros</a>, <a href="/search/cs?searchtype=author&amp;query=Kanazawa%2C+A">Angjoo Kanazawa</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.12387v1-abstract-short" style="display: inline;"> We present a unified framework capable of solving a broad range of 3D tasks. Our approach features a stateful recurrent model that continuously updates its state representation with each new observation. Given a stream of images, this evolving state can be used to generate metric-scale pointmaps (per-pixel 3D points) for each new input in an online fashion. These pointmaps reside within a common c&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.12387v1-abstract-full').style.display = 'inline'; document.getElementById('2501.12387v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.12387v1-abstract-full" style="display: none;"> We present a unified framework capable of solving a broad range of 3D tasks. Our approach features a stateful recurrent model that continuously updates its state representation with each new observation. Given a stream of images, this evolving state can be used to generate metric-scale pointmaps (per-pixel 3D points) for each new input in an online fashion. These pointmaps reside within a common coordinate system, and can be accumulated into a coherent, dense scene reconstruction that updates as new images arrive. Our model, called CUT3R (Continuous Updating Transformer for 3D Reconstruction), captures rich priors of real-world scenes: not only can it predict accurate pointmaps from image observations, but it can also infer unseen regions of the scene by probing at virtual, unobserved views. Our method is simple yet highly flexible, naturally accepting varying lengths of images that may be either video streams or unordered photo collections, containing both static and dynamic content. We evaluate our method on various 3D/4D tasks and demonstrate competitive or state-of-the-art performance in each. Project Page: https://cut3r.github.io/ <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.12387v1-abstract-full').style.display = 'none'; document.getElementById('2501.12387v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. 
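
The streaming, stateful design described in this abstract can be pictured as a simple recurrent update loop: a persistent state is refreshed with every incoming frame, and each refresh also emits a pointmap for that frame. The sketch below is purely illustrative; the class, method names, and architecture are hypothetical stand-ins and do not reflect CUT3R's actual code or API.

```python
import torch
import torch.nn as nn

class StreamingPointmapModel(nn.Module):
    """Hypothetical stateful recurrent 3D perception model (illustrative only)."""

    def __init__(self, state_dim: int = 768):
        super().__init__()
        self.encoder = nn.Conv2d(3, state_dim, kernel_size=16, stride=16)  # stand-in image encoder
        self.update = nn.GRUCell(state_dim, state_dim)                     # stand-in state update
        self.head = nn.Linear(state_dim, 3)                                # stand-in pointmap head

    def init_state(self, batch: int = 1) -> torch.Tensor:
        return torch.zeros(batch, self.update.hidden_size)

    def step(self, state: torch.Tensor, image: torch.Tensor):
        """Consume one frame, return (new_state, pointmap in a shared world frame)."""
        tokens = self.encoder(image)                        # (B, C, h, w)
        B, C, h, w = tokens.shape
        pooled = tokens.flatten(2).mean(-1)                 # (B, C) global summary of the frame
        state = self.update(pooled, state)                  # recurrent state carries scene memory
        per_token = tokens.flatten(2).transpose(1, 2)       # (B, h*w, C)
        points = self.head(per_token + state[:, None, :])   # (B, h*w, 3) per-pixel 3D points
        return state, points.view(B, h, w, 3)

# Online usage over a stream of frames: the evolving state accumulates the scene.
model = StreamingPointmapModel()
state = model.init_state()
reconstruction = []
for image in torch.rand(5, 1, 3, 224, 224):   # pretend video stream
    state, pointmap = model.step(state, image)
    reconstruction.append(pointmap)
```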
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.05450">arXiv:2501.05450</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.05450">pdf</a>, <a href="https://arxiv.org/format/2501.05450">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Decentralized Diffusion Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=McAllister%2C+D">David McAllister</a>, <a href="/search/cs?searchtype=author&amp;query=Tancik%2C+M">Matthew Tancik</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+J">Jiaming Song</a>, <a href="/search/cs?searchtype=author&amp;query=Kanazawa%2C+A">Angjoo Kanazawa</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.05450v2-abstract-short" style="display: inline;"> Large-scale AI model training divides work across thousands of GPUs, then synchronizes gradients across them at each step. This incurs a significant network burden that only centralized, monolithic clusters can support, driving up infrastructure costs and straining power systems. We propose Decentralized Diffusion Models, a scalable framework for distributing diffusion model training across indepe&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.05450v2-abstract-full').style.display = 'inline'; document.getElementById('2501.05450v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.05450v2-abstract-full" style="display: none;"> Large-scale AI model training divides work across thousands of GPUs, then synchronizes gradients across them at each step. This incurs a significant network burden that only centralized, monolithic clusters can support, driving up infrastructure costs and straining power systems. We propose Decentralized Diffusion Models, a scalable framework for distributing diffusion model training across independent clusters or datacenters by eliminating the dependence on a centralized, high-bandwidth networking fabric. Our method trains a set of expert diffusion models over partitions of the dataset, each in full isolation from one another. At inference time, the experts ensemble through a lightweight router. We show that the ensemble collectively optimizes the same objective as a single model trained over the whole dataset. This means we can divide the training burden among a number of &#34;compute islands,&#34; lowering infrastructure costs and improving resilience to localized GPU failures. Decentralized diffusion models empower researchers to take advantage of smaller, more cost-effective and more readily available compute like on-demand GPU nodes rather than central integrated systems. We conduct extensive experiments on ImageNet and LAION Aesthetics, showing that decentralized diffusion models FLOP-for-FLOP outperform standard diffusion models. 
We finally scale our approach to 24 billion parameters, demonstrating that high-quality diffusion models can now be trained with just eight individual GPU nodes in less than a week. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.05450v2-abstract-full').style.display = 'none'; document.getElementById('2501.05450v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project webpage: https://decentralizeddiffusion.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.17806">arXiv:2412.17806</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.17806">pdf</a>, <a href="https://arxiv.org/format/2412.17806">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Reconstructing People, Places, and Cameras </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=M%C3%BCller%2C+L">Lea M眉ller</a>, <a href="/search/cs?searchtype=author&amp;query=Choi%2C+H">Hongsuk Choi</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+A">Anthony Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Yi%2C+B">Brent Yi</a>, <a href="/search/cs?searchtype=author&amp;query=Malik%2C+J">Jitendra Malik</a>, <a href="/search/cs?searchtype=author&amp;query=Kanazawa%2C+A">Angjoo Kanazawa</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.17806v1-abstract-short" style="display: inline;"> We present &#34;Humans and Structure from Motion&#34; (HSfM), a method for jointly reconstructing multiple human meshes, scene point clouds, and camera parameters in a metric world coordinate system from a sparse set of uncalibrated multi-view images featuring people. Our approach combines data-driven scene reconstruction with the traditional Structure-from-Motion (SfM) framework to achieve more accurate&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.17806v1-abstract-full').style.display = 'inline'; document.getElementById('2412.17806v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.17806v1-abstract-full" style="display: none;"> We present &#34;Humans and Structure from Motion&#34; (HSfM), a method for jointly reconstructing multiple human meshes, scene point clouds, and camera parameters in a metric world coordinate system from a sparse set of uncalibrated multi-view images featuring people. Our approach combines data-driven scene reconstruction with the traditional Structure-from-Motion (SfM) framework to achieve more accurate scene reconstruction and camera estimation, while simultaneously recovering human meshes. 
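
The "experts ensemble through a lightweight router" step in this abstract can be illustrated with a toy denoising mixture. This is a hedged sketch of the general idea only, not the paper's implementation; all class names and the router architecture are hypothetical.

```python
import torch
import torch.nn as nn

class RoutedDiffusionEnsemble(nn.Module):
    """Toy routing across expert denoisers at inference time (illustrative only).

    Each expert is assumed to have been trained in isolation on its own data
    partition; the router predicts per-expert weights from the noisy sample and
    timestep, and the ensemble output is the weighted sum of expert predictions.
    """

    def __init__(self, experts: list, router: nn.Module):
        super().__init__()
        self.experts = nn.ModuleList(experts)
        self.router = router  # maps (x_t, t) -> logits over experts

    def forward(self, x_t: torch.Tensor, t: torch.Tensor) -> torch.Tensor:
        weights = torch.softmax(self.router(x_t, t), dim=-1)       # (B, num_experts)
        preds = torch.stack([e(x_t, t) for e in self.experts], 1)  # (B, num_experts, C, H, W)
        weights = weights.view(*weights.shape, *([1] * (preds.dim() - 2)))
        return (weights * preds).sum(dim=1)                        # weighted ensemble prediction

# Minimal stand-in expert and router so the sketch runs end to end.
class TinyDenoiser(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Conv2d(3, 3, 3, padding=1)
    def forward(self, x_t, t):
        return self.net(x_t)  # ignores t for brevity

class TinyRouter(nn.Module):
    def __init__(self, num_experts):
        super().__init__()
        self.proj = nn.Linear(3, num_experts)
    def forward(self, x_t, t):
        return self.proj(x_t.mean(dim=(2, 3)))  # (B, num_experts)

ensemble = RoutedDiffusionEnsemble([TinyDenoiser() for _ in range(4)], TinyRouter(4))
eps_pred = ensemble(torch.randn(2, 3, 32, 32), torch.zeros(2))  # (2, 3, 32, 32)
```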

arXiv:2412.17806 (cs.CV)
Reconstructing People, Places, and Cameras
Authors: Lea Müller, Hongsuk Choi, Anthony Zhang, Brent Yi, Jitendra Malik, Angjoo Kanazawa
Abstract: We present "Humans and Structure from Motion" (HSfM), a method for jointly reconstructing multiple human meshes, scene point clouds, and camera parameters in a metric world coordinate system from a sparse set of uncalibrated multi-view images featuring people. Our approach combines data-driven scene reconstruction with the traditional Structure-from-Motion (SfM) framework to achieve more accurate scene reconstruction and camera estimation, while simultaneously recovering human meshes. In contrast to existing scene reconstruction and SfM methods that lack metric scale information, our method estimates approximate metric scale by leveraging a human statistical model. Furthermore, it reconstructs multiple human meshes within the same world coordinate system alongside the scene point cloud, effectively capturing spatial relationships among individuals and their positions in the environment. We initialize the reconstruction of humans, scenes, and cameras using robust foundational models and jointly optimize these elements. This joint optimization synergistically improves the accuracy of each component. We compare our method to existing approaches on two challenging benchmarks, EgoHumans and EgoExo4D, demonstrating significant improvements in human localization accuracy within the world coordinate frame (reducing error from 3.51m to 1.04m in EgoHumans and from 2.9m to 0.56m in EgoExo4D). Notably, our results show that incorporating human data into the SfM pipeline improves camera pose estimation (e.g., increasing RRA@15 by 20.3% on EgoHumans). Additionally, qualitative results show that our approach improves overall scene reconstruction quality. Our code is available at: muelea.github.io/hsfm.
Submitted 23 December, 2024; originally announced December 2024.
Comments: Project website: muelea.github.io/hsfm

arXiv:2412.04463 (cs.CV)
MegaSaM: Accurate, Fast, and Robust Structure and Motion from Casual Dynamic Videos
Authors: Zhengqi Li, Richard Tucker, Forrester Cole, Qianqian Wang, Linyi Jin, Vickie Ye, Angjoo Kanazawa, Aleksander Holynski, Noah Snavely
Abstract: We present a system that allows for accurate, fast, and robust estimation of camera parameters and depth maps from casual monocular videos of dynamic scenes. Most conventional structure from motion and monocular SLAM techniques assume input videos that feature predominantly static scenes with large amounts of parallax. Such methods tend to produce erroneous estimates in the absence of these conditions. Recent neural network-based approaches attempt to overcome these challenges; however, such methods are either computationally expensive or brittle when run on dynamic videos with uncontrolled camera motion or unknown field of view. We demonstrate the surprising effectiveness of a deep visual SLAM framework: with careful modifications to its training and inference schemes, this system can scale to real-world videos of complex dynamic scenes with unconstrained camera paths, including videos with little camera parallax. Extensive experiments on both synthetic and real videos demonstrate that our system is significantly more accurate and robust at camera pose and depth estimation when compared with prior and concurrent work, with faster or comparable running times. See interactive results on our project page: https://mega-sam.github.io/
Submitted 6 December, 2024; v1 submitted 5 December, 2024; originally announced December 2024.
Comments: Project page: https://mega-sam.github.io/

arXiv:2410.23800 (cs.CV, cs.GR)
SOAR: Self-Occluded Avatar Recovery from a Single Video In the Wild
Authors: Zhuoyang Pan, Angjoo Kanazawa, Hang Gao
Abstract: Self-occlusion is common when capturing people in the wild, where the performer does not follow predefined motion scripts. This challenges existing monocular human reconstruction systems that assume full body visibility. We introduce Self-Occluded Avatar Recovery (SOAR), a method for complete human reconstruction from partial observations where parts of the body are entirely unobserved. SOAR leverages a structural normal prior and a generative diffusion prior to address this ill-posed reconstruction problem. For the structural normal prior, we model the human with a reposable surfel model with well-defined and easily readable shapes. For the generative diffusion prior, we perform an initial reconstruction and refine it using score distillation. On various benchmarks, we show that SOAR performs favorably against state-of-the-art reconstruction and generation methods, and on par with concurrent works. Additional video results and code are available at https://soar-avatar.github.io/.
Submitted 31 October, 2024; originally announced October 2024.

arXiv:2410.16259 (cs.CV, cs.GR, cs.RO)
Agent-to-Sim: Learning Interactive Behavior Models from Casual Longitudinal Videos
Authors: Gengshan Yang, Andrea Bajcsy, Shunsuke Saito, Angjoo Kanazawa
Abstract: We present Agent-to-Sim (ATS), a framework for learning interactive behavior models of 3D agents from casual longitudinal video collections. Different from prior works that rely on marker-based tracking and multiview cameras, ATS learns natural behaviors of animal and human agents non-invasively through video observations recorded over a long time-span (e.g., a month) in a single environment. Modeling 3D behavior of an agent requires persistent 3D tracking (e.g., knowing which point corresponds to which) over a long time period. To obtain such data, we develop a coarse-to-fine registration method that tracks the agent and the camera over time through a canonical 3D space, resulting in a complete and persistent spacetime 4D representation. We then train a generative model of agent behaviors using paired data of perception and motion of an agent queried from the 4D reconstruction. ATS enables real-to-sim transfer from video recordings of an agent to an interactive behavior simulator. We demonstrate results on pets (e.g., cat, dog, bunny) and humans, given monocular RGBD videos captured by a smartphone.
Submitted 21 October, 2024; originally announced October 2024.
Comments: Project page: https://gengshan-y.github.io/agent2sim-www/

arXiv:2410.03665 (cs.CV, cs.AI)
Estimating Body and Hand Motion in an Ego-sensed World
Authors: Brent Yi, Vickie Ye, Maya Zheng, Yunqi Li, Lea Müller, Georgios Pavlakos, Yi Ma, Jitendra Malik, Angjoo Kanazawa
Abstract: We present EgoAllo, a system for human motion estimation from a head-mounted device. Using only egocentric SLAM poses and images, EgoAllo guides sampling from a conditional diffusion model to estimate 3D body pose, height, and hand parameters that capture a device wearer's actions in the allocentric coordinate frame of the scene. To achieve this, our key insight is in representation: we propose spatial and temporal invariance criteria for improving model performance, from which we derive a head motion conditioning parameterization that improves estimation by up to 18%. We also show how the bodies estimated by our system can improve hand estimation: the resulting kinematic and temporal constraints can reduce world-frame errors in single-frame estimates by 40%. Project page: https://egoallo.github.io/
Submitted 17 December, 2024; v1 submitted 4 October, 2024; originally announced October 2024.
Comments: Project page: https://egoallo.github.io/

arXiv:2409.18121 (cs.RO, cs.CV)
Robot See Robot Do: Imitating Articulated Object Manipulation with Monocular 4D Reconstruction
Authors: Justin Kerr, Chung Min Kim, Mingxuan Wu, Brent Yi, Qianqian Wang, Ken Goldberg, Angjoo Kanazawa
Abstract: Humans can learn to manipulate new objects by simply watching others; providing robots with the ability to learn from such demonstrations would enable a natural interface for specifying new behaviors. This work develops Robot See Robot Do (RSRD), a method for imitating articulated object manipulation from a single monocular RGB human demonstration given a single static multi-view object scan. We first propose 4D Differentiable Part Models (4D-DPM), a method for recovering 3D part motion from a monocular video with differentiable rendering. This analysis-by-synthesis approach uses part-centric feature fields in an iterative optimization which enables the use of geometric regularizers to recover 3D motions from only a single video. Given this 4D reconstruction, the robot replicates object trajectories by planning bimanual arm motions that induce the demonstrated object part motion. By representing demonstrations as part-centric trajectories, RSRD focuses on replicating the demonstration's intended behavior while considering the robot's own morphological limits, rather than attempting to reproduce the hand's motion. We evaluate 4D-DPM's 3D tracking accuracy on ground truth annotated 3D part trajectories and RSRD's physical execution performance on 9 objects across 10 trials each on a bimanual YuMi robot. Each phase of RSRD achieves an average of 87% success rate, for a total end-to-end success rate of 60% across 90 trials. Notably, this is accomplished using only feature fields distilled from large pretrained vision models -- without any task-specific training, fine-tuning, dataset collection, or annotation. Project page: https://robot-see-robot-do.github.io
Submitted 26 September, 2024; originally announced September 2024.
Comments: CoRL 2024, Project page: https://robot-see-robot-do.github.io
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.06765v1-abstract-full').style.display = 'none'; document.getElementById('2409.06765v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">17 pages, 2 figures, JMLR MLOSS</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.04440">arXiv:2409.04440</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.04440">pdf</a>, <a href="https://arxiv.org/format/2409.04440">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Synergy and Synchrony in Couple Dances </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Maluleke%2C+V">Vongani Maluleke</a>, <a href="/search/cs?searchtype=author&amp;query=M%C3%BCller%2C+L">Lea M眉ller</a>, <a href="/search/cs?searchtype=author&amp;query=Rajasegaran%2C+J">Jathushan Rajasegaran</a>, <a href="/search/cs?searchtype=author&amp;query=Pavlakos%2C+G">Georgios Pavlakos</a>, <a href="/search/cs?searchtype=author&amp;query=Ginosar%2C+S">Shiry Ginosar</a>, <a href="/search/cs?searchtype=author&amp;query=Kanazawa%2C+A">Angjoo Kanazawa</a>, <a href="/search/cs?searchtype=author&amp;query=Malik%2C+J">Jitendra Malik</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.04440v1-abstract-short" style="display: inline;"> This paper asks to what extent social interaction influences one&#39;s behavior. We study this in the setting of two dancers dancing as a couple. We first consider a baseline in which we predict a dancer&#39;s future moves conditioned only on their past motion without regard to their partner. We then investigate the advantage of taking social information into account by conditioning also on the motion of&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.04440v1-abstract-full').style.display = 'inline'; document.getElementById('2409.04440v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.04440v1-abstract-full" style="display: none;"> This paper asks to what extent social interaction influences one&#39;s behavior. We study this in the setting of two dancers dancing as a couple. We first consider a baseline in which we predict a dancer&#39;s future moves conditioned only on their past motion without regard to their partner. We then investigate the advantage of taking social information into account by conditioning also on the motion of their dancing partner. We focus our analysis on Swing, a dance genre with tight physical coupling for which we present an in-the-wild video dataset. We demonstrate that single-person future motion prediction in this context is challenging. 
Instead, we observe that prediction greatly benefits from considering the interaction partners&#39; behavior, resulting in surprisingly compelling couple dance synthesis results (see supp. video). Our contributions are a demonstration of the advantages of socially conditioned future motion prediction and an in-the-wild, couple dance video dataset to enable future research in this direction. Video results are available on the project website: https://von31.github.io/synNsync <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.04440v1-abstract-full').style.display = 'none'; document.getElementById('2409.04440v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.13764">arXiv:2407.13764</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.13764">pdf</a>, <a href="https://arxiv.org/format/2407.13764">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Shape of Motion: 4D Reconstruction from a Single Video </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Q">Qianqian Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+V">Vickie Ye</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+H">Hang Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Austin%2C+J">Jake Austin</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Zhengqi Li</a>, <a href="/search/cs?searchtype=author&amp;query=Kanazawa%2C+A">Angjoo Kanazawa</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.13764v1-abstract-short" style="display: inline;"> Monocular dynamic reconstruction is a challenging and long-standing vision problem due to the highly ill-posed nature of the task. Existing approaches are limited in that they either depend on templates, are effective only in quasi-static scenes, or fail to model 3D motion explicitly. In this work, we introduce a method capable of reconstructing generic dynamic scenes, featuring explicit, full-seq&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.13764v1-abstract-full').style.display = 'inline'; document.getElementById('2407.13764v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.13764v1-abstract-full" style="display: none;"> Monocular dynamic reconstruction is a challenging and long-standing vision problem due to the highly ill-posed nature of the task. Existing approaches are limited in that they either depend on templates, are effective only in quasi-static scenes, or fail to model 3D motion explicitly. In this work, we introduce a method capable of reconstructing generic dynamic scenes, featuring explicit, full-sequence-long 3D motion, from casually captured monocular videos. 
arXiv:2407.13764  [pdf, other]  cs.CV
Shape of Motion: 4D Reconstruction from a Single Video
Authors: Qianqian Wang, Vickie Ye, Hang Gao, Jake Austin, Zhengqi Li, Angjoo Kanazawa
Abstract: Monocular dynamic reconstruction is a challenging and long-standing vision problem due to the highly ill-posed nature of the task. Existing approaches are limited in that they either depend on templates, are effective only in quasi-static scenes, or fail to model 3D motion explicitly. In this work, we introduce a method capable of reconstructing generic dynamic scenes, featuring explicit, full-sequence-long 3D motion, from casually captured monocular videos. We tackle the under-constrained nature of the problem with two key insights: First, we exploit the low-dimensional structure of 3D motion by representing scene motion with a compact set of SE3 motion bases. Each point's motion is expressed as a linear combination of these bases, facilitating soft decomposition of the scene into multiple rigidly-moving groups. Second, we utilize a comprehensive set of data-driven priors, including monocular depth maps and long-range 2D tracks, and devise a method to effectively consolidate these noisy supervisory signals, resulting in a globally consistent representation of the dynamic scene. Experiments show that our method achieves state-of-the-art performance for both long-range 3D/2D motion estimation and novel view synthesis on dynamic scenes. Project Page: https://shape-of-motion.github.io/
Submitted 18 July, 2024; originally announced July 2024.
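The abstract's core representational idea, per-point motion expressed as a combination of a compact set of SE(3) motion bases, can be sketched in a few lines of numpy. The basis count, random bases, and softmax weighting below are illustrative assumptions rather than the authors' parameterization; the point is only that N trajectories are produced from T x K rigid transforms plus N x K mixing weights.

    # Sketch (assumed form, not the authors' exact parameterization): per-point
    # motion as a convex combination of K per-frame SE(3) motion bases.
    import numpy as np

    def random_se3():
        # A random rotation (via QR) plus a small random translation, as a 4x4 matrix.
        q, _ = np.linalg.qr(np.random.randn(3, 3))
        q *= np.sign(np.linalg.det(q))          # ensure det = +1
        T = np.eye(4)
        T[:3, :3], T[:3, 3] = q, 0.1 * np.random.randn(3)
        return T

    K, T_frames, N = 8, 24, 1000                # bases, frames, scene points
    bases = np.array([[random_se3() for _ in range(K)] for _ in range(T_frames)])  # (T, K, 4, 4)
    points0 = np.random.randn(N, 3)             # canonical 3D points
    logits = np.random.randn(N, K)
    weights = np.exp(logits) / np.exp(logits).sum(-1, keepdims=True)  # soft group assignment

    points_h = np.concatenate([points0, np.ones((N, 1))], axis=1)     # homogeneous coords
    # Move every point by every basis, then blend with the per-point weights.
    moved = np.einsum('tkij,nj->tkni', bases, points_h)[..., :3]      # (T, K, N, 3)
    traj = np.einsum('nk,tkni->tni', weights, moved)                  # (T, N, 3)
    print(traj.shape)                                                 # (24, 1000, 3)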
arXiv:2407.12306  [pdf, other]  cs.CV
Splatfacto-W: A Nerfstudio Implementation of Gaussian Splatting for Unconstrained Photo Collections
Authors: Congrong Xu, Justin Kerr, Angjoo Kanazawa
Abstract: Novel view synthesis from unconstrained in-the-wild image collections remains a significant yet challenging task due to photometric variations and transient occluders that complicate accurate scene reconstruction. Previous methods have approached these issues by integrating per-image appearance features embeddings in Neural Radiance Fields (NeRFs). Although 3D Gaussian Splatting (3DGS) offers faster training and real-time rendering, adapting it for unconstrained image collections is non-trivial due to the substantially different architecture. In this paper, we introduce Splatfacto-W, an approach that integrates per-Gaussian neural color features and per-image appearance embeddings into the rasterization process, along with a spherical harmonics-based background model to represent varying photometric appearances and better depict backgrounds. Our key contributions include latent appearance modeling, efficient transient object handling, and precise background modeling. Splatfacto-W delivers high-quality, real-time novel view synthesis with improved scene consistency in in-the-wild scenarios. Our method improves the Peak Signal-to-Noise Ratio (PSNR) by an average of 5.3 dB compared to 3DGS, enhances training speed by 150 times compared to NeRF-based methods, and achieves a similar rendering speed to 3DGS. Additional video results and code integrated into Nerfstudio are available at https://kevinxu02.github.io/splatfactow/.
Submitted 29 September, 2024; v1 submitted 17 July, 2024; originally announced July 2024.
Comments: 9 pages
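As a rough illustration of the appearance model the abstract describes (per-Gaussian neural color features combined with per-image appearance embeddings), here is a hedged PyTorch sketch. All dimensions, the MLP shape, and the module name are made-up placeholders, and the spherical-harmonics background model is omitted.

    # Rough sketch (dimensions and architecture are assumptions) of combining
    # per-Gaussian color features with per-image appearance embeddings.
    import torch
    import torch.nn as nn

    class AppearanceColorModel(nn.Module):
        def __init__(self, num_images, gauss_feat_dim=32, appearance_dim=16):
            super().__init__()
            self.appearance = nn.Embedding(num_images, appearance_dim)  # one embedding per training image
            self.mlp = nn.Sequential(
                nn.Linear(gauss_feat_dim + appearance_dim, 64), nn.ReLU(),
                nn.Linear(64, 3), nn.Sigmoid(),                          # RGB in [0, 1]
            )

        def forward(self, gauss_feats, image_idx):
            # gauss_feats: (N, gauss_feat_dim) features of the N Gaussians being rasterized
            # image_idx: index of the image whose appearance we want to reproduce
            app = self.appearance(image_idx).expand(gauss_feats.shape[0], -1)
            return self.mlp(torch.cat([gauss_feats, app], dim=-1))       # per-Gaussian RGB

    model = AppearanceColorModel(num_images=100)
    rgb = model(torch.randn(5000, 32), torch.tensor(7))
    print(rgb.shape)  # torch.Size([5000, 3])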
arXiv:2406.09417  [pdf, other]  cs.CV cs.GR cs.LG
Rethinking Score Distillation as a Bridge Between Image Distributions
Authors: David McAllister, Songwei Ge, Jia-Bin Huang, David W. Jacobs, Alexei A. Efros, Aleksander Holynski, Angjoo Kanazawa
Abstract: Score distillation sampling (SDS) has proven to be an important tool, enabling the use of large-scale diffusion priors for tasks operating in data-poor domains. Unfortunately, SDS has a number of characteristic artifacts that limit its usefulness in general-purpose applications. In this paper, we make progress toward understanding the behavior of SDS and its variants by viewing them as solving an optimal-cost transport path from a source distribution to a target distribution. Under this new interpretation, these methods seek to transport corrupted images (source) to the natural image distribution (target). We argue that current methods' characteristic artifacts are caused by (1) linear approximation of the optimal path and (2) poor estimates of the source distribution. We show that calibrating the text conditioning of the source distribution can produce high-quality generation and translation results with little extra overhead. Our method can be easily applied across many domains, matching or beating the performance of specialized methods. We demonstrate its utility in text-to-2D, text-based NeRF optimization, translating paintings to real images, optical illusion generation, and 3D sketch-to-real. We compare our method to existing approaches for score distillation sampling and show that it can produce high-frequency details with realistic colors.
Submitted 10 December, 2024; v1 submitted 13 June, 2024; originally announced June 2024.
Comments: NeurIPS 2024. Project webpage: https://sds-bridge.github.io/
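For readers unfamiliar with score distillation sampling, the generic SDS update (the baseline this paper analyzes, not its calibrated variant) can be sketched as below. `denoiser` is a stand-in for a pretrained diffusion model's noise predictor, and the weighting is one common choice; none of this is taken verbatim from the paper.

    # Generic score-distillation-style update (standard SDS form; the paper's
    # calibrated source distribution is not reproduced here).
    import torch

    def sds_grad(image, prompt_emb, denoiser, alphas_bar, t):
        # image: (1, 3, H, W) differentiable render; t: integer timestep
        a_t = alphas_bar[t]
        noise = torch.randn_like(image)
        x_t = a_t.sqrt() * image + (1 - a_t).sqrt() * noise      # forward diffusion
        eps_pred = denoiser(x_t, t, prompt_emb)                   # predicted noise
        w = 1 - a_t                                               # a common weighting choice
        return w * (eps_pred - noise)                             # gradient w.r.t. the image

    # Toy usage with a dummy denoiser (real use would call a pretrained diffusion model).
    alphas_bar = torch.linspace(0.999, 0.01, 1000)                # toy noise schedule
    dummy_denoiser = lambda x, t, p: torch.randn_like(x)
    g = sds_grad(torch.rand(1, 3, 64, 64), None, dummy_denoiser, alphas_bar, t=500)
    print(g.shape)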
arXiv:2405.10320  [pdf, other]  cs.CV
Toon3D: Seeing Cartoons from New Perspectives
Authors: Ethan Weber, Riley Peterlinz, Rohan Mathur, Frederik Warburg, Alexei A. Efros, Angjoo Kanazawa
Abstract: We recover the underlying 3D structure from images of cartoons and anime depicting the same scene. This is an interesting problem domain because images in creative media are often depicted without explicit geometric consistency for storytelling and creative expression; they are only 3D in a qualitative sense. While humans can easily perceive the underlying 3D scene from these images, existing Structure-from-Motion (SfM) methods that assume 3D consistency fail catastrophically. We present Toon3D for reconstructing geometrically inconsistent images. Our key insight is to deform the input images while recovering camera poses and scene geometry, effectively explaining away geometrical inconsistencies to achieve consistency. This process is guided by the structure inferred from monocular depth predictions. We curate a dataset with multi-view imagery from cartoons and anime that we annotate with reliable sparse correspondences using our user-friendly annotation tool. Our recovered point clouds can be plugged into novel-view synthesis methods to experience cartoons from viewpoints never drawn before. We evaluate against classical and recent learning-based SfM methods, where Toon3D is able to obtain more reliable camera poses and scene geometry.
Submitted 10 December, 2024; v1 submitted 16 May, 2024; originally announced May 2024.
Comments: Please see our project page: https://toon3d.studio

arXiv:2405.05530  [pdf, other]  cs.CV
NurtureNet: A Multi-task Video-based Approach for Newborn Anthropometry
Authors: Yash Khandelwal, Mayur Arvind, Sriram Kumar, Ashish Gupta, Sachin Kumar Danisetty, Piyush Bagad, Anish Madan, Mayank Lunayach, Aditya Annavajjala, Abhishek Maiti, Sansiddh Jain, Aman Dalmia, Namrata Deka, Jerome White, Jigar Doshi, Angjoo Kanazawa, Rahul Panicker, Alpan Raval, Srinivas Rana, Makarand Tapaswi
Abstract: Malnutrition among newborns is a top public health concern in developing countries. Identification and subsequent growth monitoring are key to successful interventions. However, this is challenging in rural communities where health systems tend to be inaccessible and under-equipped, with poor adherence to protocol. Our goal is to equip health workers and public health systems with a solution for contactless newborn anthropometry in the community. We propose NurtureNet, a multi-task model that fuses visual information (a video taken with a low-cost smartphone) with tabular inputs to regress multiple anthropometry estimates including weight, length, head circumference, and chest circumference. We show that visual proxy tasks of segmentation and keypoint prediction further improve performance. We establish the efficacy of the model through several experiments and achieve a relative error of 3.9% and mean absolute error of 114.3 g for weight estimation. Model compression to 15 MB also allows offline deployment to low-cost smartphones.
Submitted 8 May, 2024; originally announced May 2024.
Comments: Accepted at CVPM Workshop at CVPR 2024
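As a loose illustration of the multi-task fusion described in the abstract (video features plus tabular inputs regressed to several anthropometry targets), here is a toy PyTorch sketch. The video backbone is omitted and every dimension, layer, and head name is a placeholder, not the released NurtureNet model.

    # Illustrative multi-task fusion head (backbone, dimensions and head layout
    # are placeholders, not the NurtureNet architecture).
    import torch
    import torch.nn as nn

    class MultiTaskAnthropometry(nn.Module):
        def __init__(self, video_dim=512, tabular_dim=8, hidden=128):
            super().__init__()
            self.fuse = nn.Sequential(nn.Linear(video_dim + tabular_dim, hidden), nn.ReLU())
            # One regression head per measurement.
            self.heads = nn.ModuleDict({
                name: nn.Linear(hidden, 1)
                for name in ["weight", "length", "head_circ", "chest_circ"]
            })

        def forward(self, video_emb, tabular):
            h = self.fuse(torch.cat([video_emb, tabular], dim=-1))
            return {name: head(h).squeeze(-1) for name, head in self.heads.items()}

    out = MultiTaskAnthropometry()(torch.randn(2, 512), torch.randn(2, 8))
    print({k: v.shape for k, v in out.items()})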
arXiv:2404.16221  [pdf, other]  cs.CV cs.DC cs.GR
NeRF-XL: Scaling NeRFs with Multiple GPUs
Authors: Ruilong Li, Sanja Fidler, Angjoo Kanazawa, Francis Williams
Abstract: We present NeRF-XL, a principled method for distributing Neural Radiance Fields (NeRFs) across multiple GPUs, thus enabling the training and rendering of NeRFs with an arbitrarily large capacity. We begin by revisiting existing multi-GPU approaches, which decompose large scenes into multiple independently trained NeRFs, and identify several fundamental issues with these methods that hinder improvements in reconstruction quality as additional computational resources (GPUs) are used in training. NeRF-XL remedies these issues and enables the training and rendering of NeRFs with an arbitrary number of parameters by simply using more hardware. At the core of our method lies a novel distributed training and rendering formulation, which is mathematically equivalent to the classic single-GPU case and minimizes communication between GPUs. By unlocking NeRFs with arbitrarily large parameter counts, our approach is the first to reveal multi-GPU scaling laws for NeRFs, showing improvements in reconstruction quality with larger parameter counts and speed improvements with more GPUs. We demonstrate the effectiveness of NeRF-XL on a wide variety of datasets, including the largest open-source dataset to date, MatrixCity, containing 258K images covering a 25km^2 city area.
Submitted 24 April, 2024; originally announced April 2024.
Comments: Webpage: https://research.nvidia.com/labs/toronto-ai/nerfxl/
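The abstract states that the distributed formulation is mathematically equivalent to single-GPU rendering with minimal communication. One standard way such an equivalence can hold is that volume rendering decomposes over disjoint depth segments of a ray, so each worker only needs to return its segment's accumulated color and transmittance. The numpy sketch below shows that compositing identity; it is not the paper's actual communication scheme.

    # Compositing per-worker partial renders along a ray: the standard
    # volume-rendering decomposition over disjoint depth segments.
    import numpy as np

    def composite_segments(segment_colors, segment_transmittances):
        # segment_colors[i]: (3,) color accumulated inside segment i (nearest first)
        # segment_transmittances[i]: fraction of light passing all the way through segment i
        color = np.zeros(3)
        t_before = 1.0                       # transmittance accumulated so far
        for c_i, t_i in zip(segment_colors, segment_transmittances):
            color += t_before * c_i          # segment i is attenuated by everything in front of it
            t_before *= t_i
        return color, t_before               # final color and leftover transmittance

    # Toy example: three workers each render one depth segment of the same ray.
    colors = [np.array([0.2, 0.1, 0.0]), np.array([0.1, 0.3, 0.1]), np.array([0.0, 0.0, 0.4])]
    trans = [0.8, 0.5, 0.9]
    print(composite_segments(colors, trans))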
arXiv:2404.05072  [pdf, other]  cs.CV
Spatial Cognition from Egocentric Video: Out of Sight, Not Out of Mind
Authors: Chiara Plizzari, Shubham Goel, Toby Perrett, Jacob Chalk, Angjoo Kanazawa, Dima Damen
Abstract: As humans move around, performing their daily tasks, they are able to recall where they have positioned objects in their environment, even if these objects are currently out of their sight. In this paper, we aim to mimic this spatial cognition ability. We thus formulate the task of Out of Sight, Not Out of Mind - 3D tracking active objects using observations captured through an egocentric camera. We introduce a simple but effective approach to address this challenging problem, called Lift, Match, and Keep (LMK). LMK lifts partial 2D observations to 3D world coordinates, matches them over time using visual appearance, 3D location and interactions to form object tracks, and keeps these object tracks even when they go out-of-view of the camera. We benchmark LMK on 100 long videos from EPIC-KITCHENS. Our results demonstrate that spatial cognition is critical for correctly locating objects over short and long time scales. E.g., for one long egocentric video, we estimate the 3D location of 50 active objects. After 120 seconds, 57% of the objects are correctly localised by LMK, compared to just 33% by a recent 3D method for egocentric videos and 17% by a general 2D tracking method.
Submitted 21 January, 2025; v1 submitted 7 April, 2024; originally announced April 2024.
Comments: Accepted at 3DV 2025. 14 pages including references and appendix. Project Webpage: http://dimadamen.github.io/OSNOM/
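The "Lift" step of LMK, backprojecting a partial 2D observation into world coordinates, can be illustrated with standard pinhole geometry. The sketch below uses toy intrinsics and a toy camera pose; the matching and track-keeping steps are omitted.

    # Sketch of the "Lift" step: backproject a 2D observation with depth into
    # world coordinates (intrinsics and pose here are toy values).
    import numpy as np

    def lift_to_world(uv, depth, K, cam_to_world):
        # uv: pixel (u, v); depth: metric depth along the camera z-axis
        u, v = uv
        x_cam = np.array([(u - K[0, 2]) / K[0, 0] * depth,
                          (v - K[1, 2]) / K[1, 1] * depth,
                          depth, 1.0])                     # homogeneous camera-frame point
        return (cam_to_world @ x_cam)[:3]                  # world-frame 3D location

    K = np.array([[500.0, 0, 320], [0, 500.0, 240], [0, 0, 1]])
    pose = np.eye(4); pose[:3, 3] = [0.0, 0.0, 1.5]        # camera 1.5 m from the world origin
    print(lift_to_world((400, 260), depth=2.0, K=K, cam_to_world=pose))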
arXiv:2404.03652  [pdf, other]  cs.CV
The More You See in 2D, the More You Perceive in 3D
Authors: Xinyang Han, Zelin Gao, Angjoo Kanazawa, Shubham Goel, Yossi Gandelsman
Abstract: Humans can infer 3D structure from 2D images of an object based on past experience and improve their 3D understanding as they see more images. Inspired by this behavior, we introduce SAP3D, a system for 3D reconstruction and novel view synthesis from an arbitrary number of unposed images. Given a few unposed images of an object, we adapt a pre-trained view-conditioned diffusion model together with the camera poses of the images via test-time fine-tuning. The adapted diffusion model and the obtained camera poses are then utilized as instance-specific priors for 3D reconstruction and novel view synthesis. We show that as the number of input images increases, the performance of our approach improves, bridging the gap between optimization-based prior-less 3D reconstruction methods and single-image-to-3D diffusion-based methods. We demonstrate our system on real images as well as standard synthetic benchmarks. Our ablation studies confirm that this adaption behavior is key for more accurate 3D understanding.
Submitted 4 April, 2024; originally announced April 2024.
Comments: Project page: https://sap3d.github.io/

arXiv:2401.09419  [pdf, other]  cs.CV cs.GR
GARField: Group Anything with Radiance Fields
Authors: Chung Min Kim, Mingxuan Wu, Justin Kerr, Ken Goldberg, Matthew Tancik, Angjoo Kanazawa
Abstract: Grouping is inherently ambiguous due to the multiple levels of granularity in which one can decompose a scene -- should the wheels of an excavator be considered separate or part of the whole? We present Group Anything with Radiance Fields (GARField), an approach for decomposing 3D scenes into a hierarchy of semantically meaningful groups from posed image inputs. To do this we embrace group ambiguity through physical scale: by optimizing a scale-conditioned 3D affinity feature field, a point in the world can belong to different groups of different sizes. We optimize this field from a set of 2D masks provided by Segment Anything (SAM) in a way that respects coarse-to-fine hierarchy, using scale to consistently fuse conflicting masks from different viewpoints. From this field we can derive a hierarchy of possible groupings via automatic tree construction or user interaction. We evaluate GARField on a variety of in-the-wild scenes and find it effectively extracts groups at many levels: clusters of objects, objects, and various subparts. GARField inherently represents multi-view consistent groupings and produces higher fidelity groups than the input SAM masks. GARField's hierarchical grouping could have exciting downstream applications such as 3D asset extraction or dynamic scene understanding. See the project website at https://www.garfield.studio/
Submitted 17 January, 2024; originally announced January 2024.
Comments: Project site: https://www.garfield.studio/ First three authors contributed equally
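A scale-conditioned affinity field, as described in the abstract, can be sketched as a small network that maps a (3D point, physical scale) pair to an embedding, with group affinity given by embedding similarity. Everything below (encoding, sizes, normalization) is an assumption for illustration, not the GARField implementation.

    # Sketch of a scale-conditioned affinity field: two points belong to the
    # same group at scale s when their embeddings at that scale are close.
    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    class ScaleConditionedField(nn.Module):
        def __init__(self, emb_dim=32):
            super().__init__()
            self.mlp = nn.Sequential(
                nn.Linear(3 + 1, 128), nn.ReLU(),
                nn.Linear(128, emb_dim),
            )

        def forward(self, xyz, scale):
            # xyz: (N, 3) points; scale: (N, 1) physical group scale in scene units
            feats = self.mlp(torch.cat([xyz, scale], dim=-1))
            return F.normalize(feats, dim=-1)               # unit-norm embeddings

    field = ScaleConditionedField()
    pts = torch.randn(4, 3)
    fine = field(pts, torch.full((4, 1), 0.1))              # fine-grained grouping
    coarse = field(pts, torch.full((4, 1), 2.0))            # coarse grouping
    affinity = fine @ fine.T                                # cosine affinity at scale 0.1
    print(affinity.shape)                                   # torch.Size([4, 4])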
arXiv:2401.01885  [pdf, other]  cs.CV
From Audio to Photoreal Embodiment: Synthesizing Humans in Conversations
Authors: Evonne Ng, Javier Romero, Timur Bagautdinov, Shaojie Bai, Trevor Darrell, Angjoo Kanazawa, Alexander Richard
Abstract: We present a framework for generating full-bodied photorealistic avatars that gesture according to the conversational dynamics of a dyadic interaction. Given speech audio, we output multiple possibilities of gestural motion for an individual, including face, body, and hands. The key behind our method is in combining the benefits of sample diversity from vector quantization with the high-frequency details obtained through diffusion to generate more dynamic, expressive motion. We visualize the generated motion using highly photorealistic avatars that can express crucial nuances in gestures (e.g. sneers and smirks). To facilitate this line of research, we introduce a first-of-its-kind multi-view conversational dataset that allows for photorealistic reconstruction. Experiments show our model generates appropriate and diverse gestures, outperforming both diffusion- and VQ-only methods. Furthermore, our perceptual evaluation highlights the importance of photorealism (vs. meshes) in accurately assessing subtle motion details in conversational gestures. Code and dataset available online.
Submitted 3 January, 2024; originally announced January 2024.

arXiv:2312.05251  [pdf, other]  cs.CV
Reconstructing Hands in 3D with Transformers
Authors: Georgios Pavlakos, Dandan Shan, Ilija Radosavovic, Angjoo Kanazawa, David Fouhey, Jitendra Malik
Abstract: We present an approach that can reconstruct hands in 3D from monocular input. Our approach for Hand Mesh Recovery, HaMeR, follows a fully transformer-based architecture and can analyze hands with significantly increased accuracy and robustness compared to previous work. The key to HaMeR's success lies in scaling up both the data used for training and the capacity of the deep network for hand reconstruction. For training data, we combine multiple datasets that contain 2D or 3D hand annotations. For the deep model, we use a large scale Vision Transformer architecture. Our final model consistently outperforms the previous baselines on popular 3D hand pose benchmarks. To further evaluate the effect of our design in non-controlled settings, we annotate existing in-the-wild datasets with 2D hand keypoint annotations. On this newly collected dataset of annotations, HInt, we demonstrate significant improvements over existing baselines. We make our code, data and models available on the project website: https://geopavlakos.github.io/hamer/.
Submitted 8 December, 2023; originally announced December 2023.
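The abstract specifies a fully transformer-based architecture with a large ViT backbone but not the output parameterization. The sketch below is a generic transformer-plus-regression-head layout; the split into hand pose, shape, and camera parameters, and every size, is an assumption for illustration only.

    # Generic sketch of a transformer backbone with a hand-parameter regression
    # head. The (pose, shape, camera) output split is an assumption, not taken
    # from the paper text above.
    import torch
    import torch.nn as nn

    class HandRegressor(nn.Module):
        def __init__(self, dim=256, pose_dim=45, shape_dim=10, cam_dim=3):
            super().__init__()
            self.patch_embed = nn.Linear(16 * 16 * 3, dim)           # toy patchifier
            enc_layer = nn.TransformerEncoderLayer(d_model=dim, nhead=8, batch_first=True)
            self.backbone = nn.TransformerEncoder(enc_layer, num_layers=4)
            self.head = nn.Linear(dim, pose_dim + shape_dim + cam_dim)
            self.dims = (pose_dim, shape_dim, cam_dim)

        def forward(self, patches):
            # patches: (B, num_patches, 16*16*3) flattened image patches
            tokens = self.backbone(self.patch_embed(patches))
            params = self.head(tokens.mean(dim=1))                    # pool over tokens
            return torch.split(params, self.dims, dim=-1)             # (pose, shape, camera)

    pose, shape, cam = HandRegressor()(torch.randn(2, 196, 768))
    print(pose.shape, shape.shape, cam.shape)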
arXiv:2312.04560  [pdf, other]  cs.CV cs.AI cs.GR
NeRFiller: Completing Scenes via Generative 3D Inpainting
Authors: Ethan Weber, Aleksander Hołyński, Varun Jampani, Saurabh Saxena, Noah Snavely, Abhishek Kar, Angjoo Kanazawa
Abstract: We propose NeRFiller, an approach that completes missing portions of a 3D capture via generative 3D inpainting using off-the-shelf 2D visual generative models. Often parts of a captured 3D scene or object are missing due to mesh reconstruction failures or a lack of observations (e.g., contact regions, such as the bottom of objects, or hard-to-reach areas). We approach this challenging 3D inpainting problem by leveraging a 2D inpainting diffusion model. We identify a surprising behavior of these models, where they generate more 3D consistent inpaints when images form a 2x2 grid, and show how to generalize this behavior to more than four images. We then present an iterative framework to distill these inpainted regions into a single consistent 3D scene. In contrast to related works, we focus on completing scenes rather than deleting foreground objects, and our approach does not require tight 2D object masks or text. We compare our approach to relevant baselines adapted to our setting on a variety of scenes, where NeRFiller creates the most 3D consistent and plausible scene completions. Our project page is at https://ethanweber.me/nerfiller.
Submitted 7 December, 2023; originally announced December 2023.
Comments: Project page: https://ethanweber.me/nerfiller
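The grid trick mentioned in the abstract, where a 2D inpainting diffusion model produces more 3D-consistent inpaints when four views are tiled into a 2x2 grid, is easy to sketch around a generic inpainting call. `inpaint_fn` below is a stand-in for any off-the-shelf 2D inpainting model; the iterative distillation into a 3D scene is not shown.

    # Sketch of the 2x2 grid trick: tile four rendered views (and their masks),
    # inpaint the grid in a single call, then split the result back into views.
    import numpy as np

    def tile2x2(imgs):
        top = np.concatenate([imgs[0], imgs[1]], axis=1)
        bottom = np.concatenate([imgs[2], imgs[3]], axis=1)
        return np.concatenate([top, bottom], axis=0)

    def inpaint_grid(views, masks, inpaint_fn):
        # views: list of 4 arrays (H, W, 3); masks: list of 4 arrays (H, W), 1 = missing
        grid, mask = tile2x2(views), tile2x2(masks)
        filled = inpaint_fn(grid, mask)                     # one joint inpaint over all 4 views
        h, w = views[0].shape[:2]
        return [filled[:h, :w], filled[:h, w:], filled[h:, :w], filled[h:, w:]]

    # Toy usage with a trivial "inpainter" that just greys out masked pixels.
    views = [np.random.rand(64, 64, 3) for _ in range(4)]
    masks = [np.zeros((64, 64)) for _ in range(4)]
    dummy = lambda img, m: np.where(m[..., None] > 0, 0.5, img)
    outs = inpaint_grid(views, masks, dummy)
    print(len(outs), outs[0].shape)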
arXiv:2312.02121  [pdf, ps, other]  cs.MS cs.CV cs.GR math.NA
Mathematical Supplement for the gsplat Library
Authors: Vickie Ye, Angjoo Kanazawa
Abstract: This report provides the mathematical details of the gsplat library, a modular toolbox for efficient differentiable Gaussian splatting, as proposed by Kerbl et al. It provides a self-contained reference for the computations involved in the forward and backward passes of differentiable Gaussian splatting. To facilitate practical usage and development, we provide a user friendly Python API that exposes each component of the forward and backward passes in rasterization at github.com/nerfstudio-project/gsplat.
Submitted 4 December, 2023; originally announced December 2023.
Comments: Find the library at: https://docs.gsplat.studio/
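One of the computations such a splatting reference covers is projecting a 3D Gaussian's covariance into screen space. The numpy sketch below writes out the well-known EWA-style approximation (rotate the covariance into the camera frame, then apply the Jacobian of the perspective projection at the Gaussian's mean); it is written from the standard formula rather than copied from the report, so consult the report and the gsplat docs for the library's exact conventions.

    # Project a 3D Gaussian covariance to a 2D screen-space covariance with the
    # standard EWA-splatting approximation Sigma' = J (R Sigma R^T) J^T.
    import numpy as np

    def project_covariance(mean_world, Sigma, R_wc, t_wc, fx, fy):
        # mean_world: (3,) Gaussian center; Sigma: (3,3) world-space covariance
        # R_wc, t_wc: world-to-camera rotation and translation; fx, fy: focal lengths
        x, y, z = R_wc @ mean_world + t_wc         # Gaussian center in the camera frame
        # Jacobian of the perspective projection evaluated at the center.
        J = np.array([[fx / z, 0.0, -fx * x / z**2],
                      [0.0, fy / z, -fy * y / z**2]])
        cov_cam = R_wc @ Sigma @ R_wc.T            # covariance rotated into the camera frame
        return J @ cov_cam @ J.T                   # (2, 2) screen-space covariance

    Sigma = np.diag([0.05, 0.02, 0.01])
    print(project_covariance(np.array([0.0, 0.0, 4.0]), Sigma,
                             np.eye(3), np.zeros(3), fx=500.0, fy=500.0))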
arXiv:2310.07204  [pdf, other]  cs.AI cs.CV cs.GR cs.LG
State of the Art on Diffusion Models for Visual Computing
Authors: Ryan Po, Wang Yifan, Vladislav Golyanik, Kfir Aberman, Jonathan T. Barron, Amit H. Bermano, Eric Ryan Chan, Tali Dekel, Aleksander Holynski, Angjoo Kanazawa, C. Karen Liu, Lingjie Liu, Ben Mildenhall, Matthias Nießner, Björn Ommer, Christian Theobalt, Peter Wonka, Gordon Wetzstein
Abstract: The field of visual computing is rapidly advancing due to the emergence of generative artificial intelligence (AI), which unlocks unprecedented capabilities for the generation, editing, and reconstruction of images, videos, and 3D scenes. In these domains, diffusion models are the generative AI architecture of choice. Within the last year alone, the literature on diffusion-based tools and applications has seen exponential growth and relevant papers are published across the computer graphics, computer vision, and AI communities with new works appearing daily on arXiv. This rapid growth of the field makes it difficult to keep up with all recent developments. The goal of this state-of-the-art report (STAR) is to introduce the basic mathematical concepts of diffusion models, implementation details and design choices of the popular Stable Diffusion model, as well as overview important aspects of these generative AI tools, including personalization, conditioning, inversion, among others. Moreover, we give a comprehensive overview of the rapidly growing literature on diffusion-based generation and editing, categorized by the type of generated medium, including 2D images, videos, 3D objects, locomotion, and 4D scenes. Finally, we discuss available datasets, metrics, open challenges, and social implications. This STAR provides an intuitive starting point to explore this exciting topic for researchers, artists, and practitioners alike.
Submitted 11 October, 2023; originally announced October 2023.

arXiv:2309.07970  [pdf, other]  cs.RO cs.CV
Language Embedded Radiance Fields for Zero-Shot Task-Oriented Grasping
Authors: Adam Rashid, Satvik Sharma, Chung Min Kim, Justin Kerr, Lawrence Chen, Angjoo Kanazawa, Ken Goldberg
Abstract: Grasping objects by a specific part is often crucial for safety and for executing downstream tasks. Yet, learning-based grasp planners lack this behavior unless they are trained on specific object part data, making it a significant challenge to scale object diversity. Instead, we propose LERF-TOGO, Language Embedded Radiance Fields for Task-Oriented Grasping of Objects, which uses vision-language models zero-shot to output a grasp distribution over an object given a natural language query. To accomplish this, we first reconstruct a LERF of the scene, which distills CLIP embeddings into a multi-scale 3D language field queryable with text. However, LERF has no sense of objectness, meaning its relevancy outputs often return incomplete activations over an object which are insufficient for subsequent part queries. LERF-TOGO mitigates this lack of spatial grouping by extracting a 3D object mask via DINO features and then conditionally querying LERF on this mask to obtain a semantic distribution over the object with which to rank grasps from an off-the-shelf grasp planner. We evaluate LERF-TOGO's ability to grasp task-oriented object parts on 31 different physical objects, and find it selects grasps on the correct part in 81% of all trials and grasps successfully in 69%. See the project website at: lerftogo.github.io
arXiv:2308.10897 [pdf, other] (cs.CV)
Can Language Models Learn to Listen?
Authors: Evonne Ng, Sanjay Subramanian, Dan Klein, Angjoo Kanazawa, Trevor Darrell, Shiry Ginosar
Abstract: We present a framework for generating appropriate facial responses from a listener in dyadic social interactions based on the speaker's words. Given an input transcription of the speaker's words with their timestamps, our approach autoregressively predicts a response of a listener: a sequence of listener facial gestures, quantized using a VQ-VAE. Since gesture is a language component, we propose treating the quantized atomic motion elements as additional language token inputs to a transformer-based large language model. Initializing our transformer with the weights of a language model pre-trained only on text results in significantly higher quality listener responses than training a transformer from scratch. We show that our generated listener motion is fluent and reflective of language semantics through quantitative metrics and a qualitative user study. In our evaluation, we analyze the model's ability to utilize temporal and semantic aspects of spoken text. Project page: https://people.eecs.berkeley.edu/~evonne_ng/projects/text2listen/
Submitted 21 August, 2023; originally announced August 2023.
Comments: ICCV 2023; Project page: https://people.eecs.berkeley.edu/~evonne_ng/projects/text2listen/
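A minimal sketch of the token-merging idea described above: VQ-VAE motion codebook indices are offset into an extended vocabulary and predicted autoregressively by a causal transformer. The tiny model, vocabulary sizes, and random batch below are placeholders, not the paper's configuration.

```python
# Sketch: treat motion codebook indices as extra language tokens and train a
# causal transformer to predict the next token. Toy sizes and random data.
import torch
import torch.nn as nn

TEXT_VOCAB, MOTION_CODES = 1000, 256
VOCAB = TEXT_VOCAB + MOTION_CODES          # motion code k maps to id TEXT_VOCAB + k
D, SEQ = 128, 32

class TinyCausalLM(nn.Module):
    def __init__(self):
        super().__init__()
        self.embed = nn.Embedding(VOCAB, D)
        layer = nn.TransformerEncoderLayer(D, nhead=4, batch_first=True)
        self.encoder = nn.TransformerEncoder(layer, num_layers=2)
        self.head = nn.Linear(D, VOCAB)

    def forward(self, ids):
        L = ids.size(1)
        causal = torch.triu(torch.full((L, L), float("-inf")), diagonal=1)
        h = self.encoder(self.embed(ids), mask=causal)
        return self.head(h)

# Toy batch: speaker text tokens followed by listener motion codes (offset ids).
text = torch.randint(0, TEXT_VOCAB, (4, SEQ // 2))
motion = torch.randint(0, MOTION_CODES, (4, SEQ // 2)) + TEXT_VOCAB
ids = torch.cat([text, motion], dim=1)

model = TinyCausalLM()
logits = model(ids[:, :-1])                # predict the next token at each position
loss = nn.functional.cross_entropy(logits.reshape(-1, VOCAB), ids[:, 1:].reshape(-1))
print("next-token loss:", loss.item())
```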
arXiv:2307.05473 [pdf, other] (cs.CV)
Differentiable Blocks World: Qualitative 3D Decomposition by Rendering Primitives
Authors: Tom Monnier, Jake Austin, Angjoo Kanazawa, Alexei A. Efros, Mathieu Aubry
Abstract: Given a set of calibrated images of a scene, we present an approach that produces a simple, compact, and actionable 3D world representation by means of 3D primitives. While many approaches focus on recovering high-fidelity 3D scenes, we focus on parsing a scene into mid-level 3D representations made of a small set of textured primitives. Such representations are interpretable, easy to manipulate and suited for physics-based simulations. Moreover, unlike existing primitive decomposition methods that rely on 3D input data, our approach operates directly on images through differentiable rendering. Specifically, we model primitives as textured superquadric meshes and optimize their parameters from scratch with an image rendering loss. We highlight the importance of modeling transparency for each primitive, which is critical for optimization and also enables handling varying numbers of primitives. We show that the resulting textured primitives faithfully reconstruct the input images and accurately model the visible 3D points, while providing amodal shape completions of unseen object regions. We compare our approach to the state of the art on diverse scenes from DTU, and demonstrate its robustness on real-life captures from BlendedMVS and Nerfstudio. We also showcase how our results can be used to effortlessly edit a scene or perform physical simulations. Code and video results are available at https://www.tmonnier.com/DBW.
Submitted 26 December, 2023; v1 submitted 11 July, 2023; originally announced July 2023.
Comments: Project webpage with code and videos: https://www.tmonnier.com/DBW. V2 update includes comparisons based on NeuS, hyperparameter analysis and failure cases
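The textured superquadric primitives mentioned above follow the standard superquadric parameterization; the sketch below samples such a surface. Scales and exponents are arbitrary example values, not ones from the paper.

```python
# Standard superquadric surface parameterization. Varying eps1/eps2 morphs the
# primitive between box-, sphere- and cylinder-like shapes.
import numpy as np

def superquadric_points(scale=(1.0, 0.7, 0.5), eps=(0.3, 1.0), n=64):
    a1, a2, a3 = scale
    e1, e2 = eps
    eta = np.linspace(-np.pi / 2, np.pi / 2, n)      # latitude
    omega = np.linspace(-np.pi, np.pi, n)            # longitude
    eta, omega = np.meshgrid(eta, omega, indexing="ij")

    f = lambda w, e: np.sign(w) * np.abs(w) ** e     # signed power
    x = a1 * f(np.cos(eta), e1) * f(np.cos(omega), e2)
    y = a2 * f(np.cos(eta), e1) * f(np.sin(omega), e2)
    z = a3 * f(np.sin(eta), e1)
    return np.stack([x, y, z], axis=-1)              # (n, n, 3) surface samples

pts = superquadric_points()
print(pts.shape, pts.reshape(-1, 3).min(0), pts.reshape(-1, 3).max(0))
```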
arXiv:2306.09337 [pdf, other] (cs.CV)
Generative Proxemics: A Prior for 3D Social Interaction from Images
Authors: Lea Müller, Vickie Ye, Georgios Pavlakos, Michael Black, Angjoo Kanazawa
Abstract: Social interaction is a fundamental aspect of human behavior and communication. The way individuals position themselves in relation to others, also known as proxemics, conveys social cues and affects the dynamics of social interaction. Reconstructing such interaction from images presents challenges because of mutual occlusion and the limited availability of large training datasets. To address this, we present a novel approach that learns a prior over the 3D proxemics of two people in close social interaction and demonstrate its use for single-view 3D reconstruction. We start by creating 3D training data of interacting people using image datasets with contact annotations. We then model the proxemics using a novel denoising diffusion model called BUDDI that learns the joint distribution over the poses of two people in close social interaction. Sampling from our generative proxemics model produces realistic 3D human interactions, which we validate through a perceptual study. We use BUDDI in reconstructing two people in close proximity from a single image without any contact annotation via an optimization approach that uses the diffusion model as a prior. Our approach recovers accurate and plausible 3D social interactions from noisy initial estimates, outperforming state-of-the-art methods. Our code, data, and model are available at our project website: muelea.github.io/buddi.
Submitted 12 December, 2023; v1 submitted 15 June, 2023; originally announced June 2023.
Comments: Project website: muelea.github.io/buddi

arXiv:2305.20091 [pdf, other] (cs.CV)
Humans in 4D: Reconstructing and Tracking Humans with Transformers
Authors: Shubham Goel, Georgios Pavlakos, Jathushan Rajasegaran, Angjoo Kanazawa, Jitendra Malik
Abstract: We present an approach to reconstruct humans and track them over time. At the core of our approach, we propose a fully "transformerized" version of a network for human mesh recovery. This network, HMR 2.0, advances the state of the art and shows the capability to analyze unusual poses that have in the past been difficult to reconstruct from single images. To analyze video, we use 3D reconstructions from HMR 2.0 as input to a tracking system that operates in 3D. This enables us to deal with multiple people and maintain identities through occlusion events. Our complete approach, 4DHumans, achieves state-of-the-art results for tracking people from monocular video. Furthermore, we demonstrate the effectiveness of HMR 2.0 on the downstream task of action recognition, achieving significant improvements over previous pose-based action recognition approaches. Our code and models are available on the project website: https://shubham-goel.github.io/4dhumans/.
Submitted 31 August, 2023; v1 submitted 31 May, 2023; originally announced May 2023.
Comments: In ICCV 2023. Project Webpage: https://shubham-goel.github.io/4dhumans/

arXiv:2305.04966 [pdf, other] (cs.CV)
NerfAcc: Efficient Sampling Accelerates NeRFs
Authors: Ruilong Li, Hang Gao, Matthew Tancik, Angjoo Kanazawa
Abstract: Optimizing and rendering Neural Radiance Fields is computationally expensive due to the vast number of samples required by volume rendering. Recent works have included alternative sampling approaches to help accelerate their methods; however, sampling is often not the focus of the work. In this paper, we investigate and compare multiple sampling approaches and demonstrate that improved sampling is generally applicable across NeRF variants under a unified concept of transmittance estimator. To facilitate future experiments, we develop NerfAcc, a Python toolbox that provides flexible APIs for incorporating advanced sampling methods into NeRF-related methods. We demonstrate its flexibility by showing that it can reduce the training time of several recent NeRF methods by 1.5x to 20x with minimal modifications to the existing codebase. Additionally, highly customized NeRFs, such as Instant-NGP, can be implemented in native PyTorch using NerfAcc.
Submitted 24 October, 2023; v1 submitted 8 May, 2023; originally announced May 2023.
Comments: Website: https://www.nerfacc.com
Journal ref: ICCV 2023
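A minimal sketch of the transmittance idea behind efficient sampling: volume-rendering weights along a ray fall off with accumulated transmittance, so samples whose transmittance has dropped below a threshold contribute almost nothing and can be skipped. This is conceptual PyTorch, not NerfAcc's actual API.

```python
# Generic volume-rendering weights along one ray, plus a transmittance-based
# skip criterion. Densities and spacings here are random toy values.
import torch

sigmas = torch.rand(128) * 5.0                  # densities at samples along a ray
deltas = torch.full((128,), 0.01)               # spacing between samples

alphas = 1.0 - torch.exp(-sigmas * deltas)      # per-sample opacity
trans = torch.cumprod(1.0 - alphas + 1e-10, dim=0)
trans = torch.cat([torch.ones(1), trans[:-1]])  # T_i: transmittance before sample i
weights = trans * alphas                        # w_i = T_i * alpha_i

keep = trans > 1e-2                             # skip nearly occluded samples
print(f"kept {int(keep.sum())} / {len(sigmas)} samples,",
      f"dropped weight mass: {float(weights[~keep].sum()):.4f}")
```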
arXiv:2304.10532 [pdf, other] (cs.CV, cs.AI, cs.GR)
Nerfbusters: Removing Ghostly Artifacts from Casually Captured NeRFs
Authors: Frederik Warburg, Ethan Weber, Matthew Tancik, Aleksander Holynski, Angjoo Kanazawa
Abstract: Casually captured Neural Radiance Fields (NeRFs) suffer from artifacts such as floaters or flawed geometry when rendered outside the camera trajectory. Existing evaluation protocols often do not capture these effects, since they usually only assess image quality at every 8th frame of the training capture. To push forward progress in novel-view synthesis, we propose a new dataset and evaluation procedure, where two camera trajectories are recorded of the scene: one used for training, and the other for evaluation. In this more challenging in-the-wild setting, we find that existing hand-crafted regularizers do not remove floaters nor improve scene geometry. Thus, we propose a 3D diffusion-based method that leverages local 3D priors and a novel density-based score distillation sampling loss to discourage artifacts during NeRF optimization. We show that this data-driven prior removes floaters and improves scene geometry for casual captures.
Submitted 17 October, 2023; v1 submitted 20 April, 2023; originally announced April 2023.
Comments: ICCV 2023, project page: https://ethanweber.me/nerfbusters

arXiv:2304.02061 [pdf, other] (cs.CV)
Generating Continual Human Motion in Diverse 3D Scenes
Authors: Aymen Mir, Xavier Puig, Angjoo Kanazawa, Gerard Pons-Moll
Abstract: We introduce a method to synthesize animator guided human motion across 3D scenes. Given a set of sparse (3 or 4) joint locations (such as the location of a person's hand and two feet) and a seed motion sequence in a 3D scene, our method generates a plausible motion sequence starting from the seed motion while satisfying the constraints imposed by the provided keypoints. We decompose the continual motion synthesis problem into walking along paths and transitioning in and out of the actions specified by the keypoints, which enables long generation of motions that satisfy scene constraints without explicitly incorporating scene information. Our method is trained only using scene agnostic mocap data. As a result, our approach is deployable across 3D scenes with various geometries. For achieving plausible continual motion synthesis without drift, our key contribution is to generate motion in a goal-centric canonical coordinate frame where the next immediate target is situated at the origin. Our model can generate long sequences of diverse actions such as grabbing, sitting and leaning chained together in arbitrary order, demonstrated on scenes of varying geometry: HPS, Replica, Matterport, ScanNet and scenes represented using NeRFs. Several experiments demonstrate that our method outperforms existing methods that navigate paths in 3D scenes. For more results we urge the reader to watch our supplementary video available at: https://www.youtube.com/watch?v=0wZgsdyCT4A&t=1s
Submitted 2 February, 2025; v1 submitted 4 April, 2023; originally announced April 2023.
Comments: Webpage: https://virtualhumans.mpi-inf.mpg.de/origin_2/
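The goal-centric canonical coordinate frame mentioned above can be sketched as a rigid transform that places the next target at the origin and undoes the person's heading. The frame convention (z-up) and toy data below are assumptions for illustration, not the paper's exact definition.

```python
# Sketch: express joint positions relative to the next target so the target
# sits at the origin and the current heading is rotated away.
import numpy as np

def to_goal_centric(joints_xyz, goal_xyz, heading_rad):
    """joints_xyz: (J, 3) world-space joints; returns joints in the goal frame."""
    c, s = np.cos(-heading_rad), np.sin(-heading_rad)
    rot_z = np.array([[c, -s, 0.0], [s, c, 0.0], [0.0, 0.0, 1.0]])  # undo heading about z (up)
    return (joints_xyz - goal_xyz) @ rot_z.T

joints = np.random.rand(22, 3) + np.array([2.0, 3.0, 0.0])   # person somewhere in the scene
goal = np.array([2.5, 3.5, 0.0])                             # next keypoint target
canonical = to_goal_centric(joints, goal, heading_rad=np.pi / 4)
print(canonical.mean(axis=0))                                # roughly centered near the origin
```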
arXiv:2304.01199 [pdf, other] (cs.CV)
On the Benefits of 3D Pose and Tracking for Human Action Recognition
Authors: Jathushan Rajasegaran, Georgios Pavlakos, Angjoo Kanazawa, Christoph Feichtenhofer, Jitendra Malik
Abstract: In this work we study the benefits of using tracking and 3D poses for action recognition. To achieve this, we take the Lagrangian view on analysing actions over a trajectory of human motion rather than at a fixed point in space. Taking this stand allows us to use the tracklets of people to predict their actions. In this spirit, first we show the benefits of using 3D pose to infer actions, and study person-person interactions. Subsequently, we propose a Lagrangian Action Recognition model by fusing 3D pose and contextualized appearance over tracklets. To this end, our method achieves state-of-the-art performance on the AVA v2.2 dataset on both pose only settings and on standard benchmark settings. When reasoning about the action using only pose cues, our pose model achieves +10.0 mAP gain over the corresponding state-of-the-art while our fused model has a gain of +2.8 mAP over the best state-of-the-art model. Code and results are available at: https://brjathu.github.io/LART
Submitted 7 August, 2023; v1 submitted 3 April, 2023; originally announced April 2023.
Comments: CVPR2023 (project page: https://brjathu.github.io/LART)

arXiv:2303.12789 [pdf, other] (cs.CV, cs.GR)
Instruct-NeRF2NeRF: Editing 3D Scenes with Instructions
Authors: Ayaan Haque, Matthew Tancik, Alexei A. Efros, Aleksander Holynski, Angjoo Kanazawa
Abstract: We propose a method for editing NeRF scenes with text-instructions. Given a NeRF of a scene and the collection of images used to reconstruct it, our method uses an image-conditioned diffusion model (InstructPix2Pix) to iteratively edit the input images while optimizing the underlying scene, resulting in an optimized 3D scene that respects the edit instruction. We demonstrate that our proposed method is able to edit large-scale, real-world scenes, and is able to accomplish more realistic, targeted edits than prior work.
Submitted 1 June, 2023; v1 submitted 22 March, 2023; originally announced March 2023.
Comments: Project website: https://instruct-nerf2nerf.github.io; v1. Revisions to related work and discussion

arXiv:2303.09553 [pdf, other] (cs.CV, cs.GR)
LERF: Language Embedded Radiance Fields
Authors: Justin Kerr, Chung Min Kim, Ken Goldberg, Angjoo Kanazawa, Matthew Tancik
Abstract: Humans describe the physical world using natural language to refer to specific 3D locations based on a vast range of properties: visual appearance, semantics, abstract associations, or actionable affordances. In this work we propose Language Embedded Radiance Fields (LERFs), a method for grounding language embeddings from off-the-shelf models like CLIP into NeRF, which enable these types of open-ended language queries in 3D. LERF learns a dense, multi-scale language field inside NeRF by volume rendering CLIP embeddings along training rays, supervising these embeddings across training views to provide multi-view consistency and smooth the underlying language field. After optimization, LERF can extract 3D relevancy maps for a broad range of language prompts interactively in real-time, which has potential use cases in robotics, understanding vision-language models, and interacting with 3D scenes. LERF enables pixel-aligned, zero-shot queries on the distilled 3D CLIP embeddings without relying on region proposals or masks, supporting long-tail open-vocabulary queries hierarchically across the volume. The project website can be found at https://lerf.io.
Submitted 16 March, 2023; originally announced March 2023.
Comments: Project website can be found at https://lerf.io
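One way to picture the relevancy maps mentioned above is to score rendered per-pixel language embeddings for a text query against a set of canonical negative phrases with a pairwise softmax. In the sketch below, random unit vectors stand in for CLIP embeddings, and the exact scoring rule and negatives used by LERF may differ from this illustration.

```python
# Sketch: turn rendered embeddings into a relevancy map for a text query.
import torch
import torch.nn.functional as F

H, W, D = 64, 64, 512
pixel_embs = F.normalize(torch.randn(H, W, D), dim=-1)   # rendered language embeddings
query = F.normalize(torch.randn(D), dim=0)               # e.g. "blue mug"
negatives = F.normalize(torch.randn(4, D), dim=-1)       # e.g. "object", "stuff", ...

sim_q = pixel_embs @ query                                # (H, W)
sim_n = pixel_embs @ negatives.T                          # (H, W, 4)

# For each negative, how strongly the pixel prefers the query; keep the worst case.
pairwise = torch.exp(sim_q)[..., None] / (torch.exp(sim_q)[..., None] + torch.exp(sim_n))
relevancy = pairwise.min(dim=-1).values                   # (H, W), values in (0, 1)
print(relevancy.shape, float(relevancy.max()))
```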
arXiv:2302.12827 [pdf, other] (cs.CV)
Decoupling Human and Camera Motion from Videos in the Wild
Authors: Vickie Ye, Georgios Pavlakos, Jitendra Malik, Angjoo Kanazawa
Abstract: We propose a method to reconstruct global human trajectories from videos in the wild. Our optimization method decouples the camera and human motion, which allows us to place people in the same world coordinate frame. Most existing methods do not model the camera motion; methods that rely on the background pixels to infer 3D human motion usually require a full scene reconstruction, which is often not possible for in-the-wild videos. However, even when existing SLAM systems cannot recover accurate scene reconstructions, the background pixel motion still provides enough signal to constrain the camera motion. We show that relative camera estimates along with data-driven human motion priors can resolve the scene scale ambiguity and recover global human trajectories. Our method robustly recovers the global 3D trajectories of people in challenging in-the-wild videos, such as PoseTrack. We quantify our improvement over existing methods on the 3D human dataset EgoBody. We further demonstrate that our recovered camera scale allows us to reason about the motion of multiple people in a shared coordinate frame, which improves performance of downstream tracking in PoseTrack. Code and video results can be found at https://vye16.github.io/slahmr.
Submitted 20 March, 2023; v1 submitted 24 February, 2023; originally announced February 2023.
Comments: Project site: https://vye16.github.io/slahmr. CVPR 2023

arXiv:2302.04264 [pdf, other] (cs.CV, cs.GR) DOI: 10.1145/3588432.3591516
Nerfstudio: A Modular Framework for Neural Radiance Field Development
Authors: Matthew Tancik, Ethan Weber, Evonne Ng, Ruilong Li, Brent Yi, Justin Kerr, Terrance Wang, Alexander Kristoffersen, Jake Austin, Kamyar Salahi, Abhik Ahuja, David McAllister, Angjoo Kanazawa
Abstract: Neural Radiance Fields (NeRF) are a rapidly growing area of research with wide-ranging applications in computer vision, graphics, robotics, and more. In order to streamline the development and deployment of NeRF research, we propose a modular PyTorch framework, Nerfstudio. Our framework includes plug-and-play components for implementing NeRF-based methods, which make it easy for researchers and practitioners to incorporate NeRF into their projects. Additionally, the modular design enables support for extensive real-time visualization tools, streamlined pipelines for importing captured in-the-wild data, and tools for exporting to video, point cloud and mesh representations. The modularity of Nerfstudio enables the development of Nerfacto, our method that combines components from recent papers to achieve a balance between speed and quality, while also remaining flexible to future modifications. To promote community-driven development, all associated code and data are made publicly available with open-source licensing at https://nerf.studio.
Submitted 16 October, 2023; v1 submitted 8 February, 2023; originally announced February 2023.
Comments: Project page at https://nerf.studio

arXiv:2301.10241 [pdf, other] (cs.CV)
K-Planes: Explicit Radiance Fields in Space, Time, and Appearance
Authors: Sara Fridovich-Keil, Giacomo Meanti, Frederik Warburg, Benjamin Recht, Angjoo Kanazawa
Abstract: We introduce k-planes, a white-box model for radiance fields in arbitrary dimensions. Our model uses d choose 2 planes to represent a d-dimensional scene, providing a seamless way to go from static (d=3) to dynamic (d=4) scenes. This planar factorization makes adding dimension-specific priors easy, e.g. temporal smoothness and multi-resolution spatial structure, and induces a natural decomposition of static and dynamic components of a scene. We use a linear feature decoder with a learned color basis that yields similar performance as a nonlinear black-box MLP decoder. Across a range of synthetic and real, static and dynamic, fixed and varying appearance scenes, k-planes yields competitive and often state-of-the-art reconstruction fidelity with low memory usage, achieving 1000x compression over a full 4D grid, and fast optimization with a pure PyTorch implementation. For video results and code, please see https://sarafridov.github.io/K-Planes.
Submitted 24 March, 2023; v1 submitted 24 January, 2023; originally announced January 2023.
Comments: Project page https://sarafridov.github.io/K-Planes/
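The "d choose 2" factorization is easy to make concrete: for d = 4 (x, y, z, t) there are six planes, each queried with one coordinate pair, with features combined multiplicatively and decoded linearly. Grid resolution, feature width, and the decoder below are toy choices for illustration, not the paper's configuration.

```python
# Sketch of a k-planes-style factorization for d = 4.
import itertools
import torch
import torch.nn.functional as F

D_IN, FEAT, RES = 4, 16, 32
pairs = list(itertools.combinations(range(D_IN), 2))         # 6 planes for d = 4
planes = torch.nn.ParameterList(
    [torch.nn.Parameter(torch.randn(1, FEAT, RES, RES) * 0.1) for _ in pairs]
)
decoder = torch.nn.Linear(FEAT, 4)                            # e.g. RGB + density

def encode(coords):                                           # coords: (N, 4) in [-1, 1]
    feat = torch.ones(coords.shape[0], FEAT)
    for plane, (i, j) in zip(planes, pairs):
        grid = coords[:, [i, j]].view(1, -1, 1, 2)            # query plane (i, j)
        sampled = F.grid_sample(plane, grid, align_corners=True)  # (1, FEAT, N, 1)
        feat = feat * sampled[0, :, :, 0].T                   # Hadamard product
    return feat

coords = torch.rand(1024, D_IN) * 2 - 1
out = decoder(encode(coords))
print(out.shape)                                              # torch.Size([1024, 4])
```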
arXiv:2210.13445 [pdf, other] (cs.CV)
Monocular Dynamic View Synthesis: A Reality Check
Authors: Hang Gao, Ruilong Li, Shubham Tulsiani, Bryan Russell, Angjoo Kanazawa
Abstract: We study the recent progress on dynamic view synthesis (DVS) from monocular video. Though existing approaches have demonstrated impressive results, we show a discrepancy between the practical capture process and the existing experimental protocols, which effectively leaks in multi-view signals during training. We define effective multi-view factors (EMFs) to quantify the amount of multi-view signal present in the input capture sequence based on the relative camera-scene motion. We introduce two new metrics: co-visibility masked image metrics and correspondence accuracy, which overcome the issue in existing protocols. We also propose a new iPhone dataset that includes more diverse real-life deformation sequences. Using our proposed experimental protocol, we show that the state-of-the-art approaches observe a 1-2 dB drop in masked PSNR in the absence of multi-view cues and a 4-5 dB drop when modeling complex motion. Code and data can be found at https://hangg7.com/dycheck.
Submitted 24 October, 2022; originally announced October 2022.
Comments: NeurIPS 2022. Project page: https://hangg7.com/dycheck. Code: https://github.com/KAIR-BAIR/dycheck
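The co-visibility masked image metrics mentioned above restrict the error computation to pixels marked as co-visible, so regions never observed from other views do not dominate the score. A minimal masked-PSNR sketch with toy data follows; the paper's exact masking rule lives in the dycheck code.

```python
# Masked PSNR: compute the metric only over valid (co-visible) pixels.
import numpy as np

def masked_psnr(pred, gt, mask, max_val=1.0):
    """pred, gt: (H, W, 3) in [0, 1]; mask: (H, W) boolean of valid pixels."""
    mse = ((pred - gt) ** 2)[mask].mean()
    return 10.0 * np.log10(max_val ** 2 / mse)

rng = np.random.default_rng(0)
gt = rng.random((64, 64, 3))
pred = np.clip(gt + rng.normal(0, 0.05, gt.shape), 0, 1)
mask = rng.random((64, 64)) > 0.3              # ~70% of pixels are co-visible
print(f"masked PSNR: {masked_psnr(pred, gt, mask):.2f} dB")
```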
arXiv:2209.02836 [cs.CV, cs.LG]
Studying Bias in GANs through the Lens of Race
Authors: Vongani H. Maluleke, Neerja Thakkar, Tim Brooks, Ethan Weber, Trevor Darrell, Alexei A. Efros, Angjoo Kanazawa, Devin Guillory
Abstract: In this work, we study how the performance and evaluation of generative image models are impacted by the racial composition of their training datasets. By examining and controlling the racial distributions in various training datasets, we are able to observe the impacts of different training distributions on generated image quality and the racial distributions of the generated images. Our results show that the racial compositions of generated images successfully preserve that of the training data. However, we observe that truncation, a technique used to generate higher quality images during inference, exacerbates racial imbalances in the data.
Lastly, when examining the relationship between image quality and race, we find that the highest perceived visual quality images of a given race come from a distribution where that race is well-represented, and that annotators consistently prefer generated images of white people over those of Black people.
Submitted 14 September, 2022; v1 submitted 6 September, 2022; originally announced September 2022.
Comments: ECCV 2022. Project Page: https://neerja.me/bias-gans/
ACM Class: I.4
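A minimal sketch of the truncation technique referenced above, as it is commonly applied at GAN inference time: latent codes are pulled toward their mean, trading diversity for quality. The Gaussian-latent assumption and variable names are illustrative, not taken from the paper.

```python
import numpy as np

def truncate_latents(z, psi=0.7, z_mean=None):
    """Pull latent codes toward their mean by a factor psi in [0, 1].

    psi = 1 keeps the original distribution; smaller psi reduces diversity,
    which is why truncation can exacerbate imbalances already in the data.
    """
    if z_mean is None:
        z_mean = np.zeros_like(z[0])        # standard-normal latents have zero mean
    return z_mean + psi * (z - z_mean)

# toy usage: 8 latent codes of dimension 512
z = np.random.randn(8, 512)
z_trunc = truncate_latents(z, psi=0.5)
print(z.std(), z_trunc.std())               # truncated latents have a smaller spread
```
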
arXiv:2207.14279 [cs.CV]
The One Where They Reconstructed 3D Humans and Environments in TV Shows
Authors: Georgios Pavlakos, Ethan Weber, Matthew Tancik, Angjoo Kanazawa
Abstract: TV shows depict a wide variety of human behaviors and have been studied extensively for their potential to be a rich source of data for many applications. However, the majority of the existing work focuses on 2D recognition tasks. In this paper, we make the observation that there is a certain persistence in TV shows, i.e., repetition of the environments and the humans, which makes possible the 3D reconstruction of this content. Building on this insight, we propose an automatic approach that operates on an entire season of a TV show and aggregates information in 3D; we build a 3D model of the environment and compute camera information, static 3D scene structure and body scale information. Then, we demonstrate how this information acts as rich 3D context that can guide and improve the recovery of 3D human pose and position in these environments. Moreover, we show that reasoning about humans and their environment in 3D enables a broad range of downstream applications: re-identification, gaze estimation, cinematography and image editing. We apply our approach to environments from seven iconic TV shows and perform an extensive evaluation of the proposed system.
Submitted 28 July, 2022; originally announced July 2022.
Comments: ECCV 2022. Project page: http://ethanweber.me/sitcoms3D/
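As a small illustration of how recovered camera information ties 3D reconstruction back to image evidence, here is a pinhole projection of a 3D point into pixel coordinates. The intrinsics and extrinsics below are toy values, not quantities estimated from a TV show.

```python
import numpy as np

def project_point(X_world, K, R, t):
    """Project a 3D point into pixel coordinates with a pinhole camera.

    X_world: (3,) point in world coordinates.
    K: (3, 3) intrinsics; R: (3, 3) rotation; t: (3,) translation (world -> camera).
    """
    X_cam = R @ X_world + t                  # world -> camera coordinates
    x = K @ X_cam                            # camera -> homogeneous image coordinates
    return x[:2] / x[2]                      # perspective divide -> pixels

K = np.array([[1000.0, 0.0, 640.0],
              [0.0, 1000.0, 360.0],
              [0.0, 0.0, 1.0]])
R, t = np.eye(3), np.zeros(3)
print(project_point(np.array([0.2, -0.1, 3.0]), K, R, t))   # a point 3 m in front of the camera
```
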
arXiv:2207.11148 [cs.CV]
InfiniteNature-Zero: Learning Perpetual View Generation of Natural Scenes from Single Images
Authors: Zhengqi Li, Qianqian Wang, Noah Snavely, Angjoo Kanazawa
Abstract: We present a method for learning to generate unbounded flythrough videos of natural scenes starting from a single view, where this capability is learned from a collection of single photographs, without requiring camera poses or even multiple views of each scene. To achieve this, we propose a novel self-supervised view generation training paradigm, where we sample and render virtual camera trajectories, including cyclic ones, allowing our model to learn stable view generation from a collection of single views. At test time, despite never seeing a video during training, our approach can take a single image and generate long camera trajectories comprised of hundreds of new views with realistic and diverse content. We compare our approach with recent state-of-the-art supervised view generation methods that require posed multi-view videos and demonstrate superior performance and synthesis quality.
Submitted 22 July, 2022; originally announced July 2022.
Comments: ECCV 2022 (Oral Presentation)
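To make the cyclic-trajectory idea concrete, here is a hedged sketch of sampling a virtual camera path that closes back onto the starting pose, so the final rendered view can be compared against the original input for self-supervision. The circular path and its parameters are illustrative assumptions, not the paper's sampling scheme.

```python
import numpy as np

def cyclic_camera_path(n_steps=30, radius=0.5):
    """Sample camera centers on a closed loop that returns to the starting camera.

    Rendering along such a loop lets the final view be compared against the
    original input image, giving a self-supervised training signal.
    """
    angles = np.linspace(0.0, 2.0 * np.pi, n_steps + 1)   # includes the closing pose
    # a circle in the x-z plane that starts (and ends) at the origin
    return np.stack([radius * np.sin(angles),
                     np.zeros_like(angles),
                     radius * (1.0 - np.cos(angles))], axis=1)

path = cyclic_camera_path()
print(np.allclose(path[0], path[-1]))   # True: the loop closes on the starting camera
```
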
arXiv:2206.10457 [cs.CV]
Domain Adaptive 3D Pose Augmentation for In-the-wild Human Mesh Recovery
Authors: Zhenzhen Weng, Kuan-Chieh Wang, Angjoo Kanazawa, Serena Yeung
Abstract: The ability to perceive 3D human bodies from a single image has a multitude of applications ranging from entertainment and robotics to neuroscience and healthcare. A fundamental challenge in human mesh recovery is in collecting the ground truth 3D mesh targets required for training, which requires burdensome motion capturing systems and is often limited to indoor laboratories. As a result, while progress is made on benchmark datasets collected in these restrictive settings, models fail to generalize to real-world "in-the-wild" scenarios due to distribution shifts. We propose Domain Adaptive 3D Pose Augmentation (DAPA), a data augmentation method that enhances the model's generalization ability in in-the-wild scenarios. DAPA combines the strength of methods based on synthetic datasets, by getting direct supervision from the synthesized meshes, with that of domain adaptation methods, by using ground truth 2D keypoints from the target dataset. We show quantitatively that finetuning with DAPA effectively improves results on the 3DPW and AGORA benchmarks. We further demonstrate the utility of DAPA on a challenging dataset curated from videos of real-world parent-child interaction.
Submitted 13 September, 2022; v1 submitted 21 June, 2022; originally announced June 2022.
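A minimal sketch of the kind of 2D keypoint supervision drawn from a target dataset: project predicted 3D joints with a weak-perspective camera and penalize the distance to annotated 2D keypoints. The weak-perspective model and all names are assumptions for illustration, not the authors' exact formulation.

```python
import numpy as np

def keypoint_reprojection_loss(joints3d, keypoints2d, scale, trans, conf=None):
    """L1 loss between projected 3D joints and annotated 2D keypoints.

    joints3d:     (J, 3) predicted 3D joints.
    keypoints2d:  (J, 2) ground-truth 2D keypoints.
    scale, trans: weak-perspective camera (scalar scale, (2,) image translation).
    conf:         optional (J,) per-keypoint confidence weights.
    """
    proj = scale * joints3d[:, :2] + trans         # weak-perspective projection
    err = np.abs(proj - keypoints2d).sum(axis=1)   # per-joint L1 error
    if conf is not None:
        err = err * conf
    return err.mean()

joints3d = np.random.randn(24, 3)
keypoints2d = np.random.rand(24, 2) * 224
print(keypoint_reprojection_loss(joints3d, keypoints2d,
                                 scale=100.0, trans=np.array([112.0, 112.0])))
```
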
arXiv:2206.08929 [cs.CV, cs.AI]
TAVA: Template-free Animatable Volumetric Actors
Authors: Ruilong Li, Julian Tanke, Minh Vo, Michael Zollhofer, Jurgen Gall, Angjoo Kanazawa, Christoph Lassner
Abstract: Coordinate-based volumetric representations have the potential to generate photo-realistic virtual avatars from images. However, virtual avatars also need to be controllable, even in a novel pose that may not have been observed. Traditional techniques, such as LBS, provide such a function; yet they usually require a hand-designed body template, 3D scan data, and limited appearance models. On the other hand, neural representations have been shown to be powerful in representing visual details, but are underexplored for deforming dynamic articulated actors. In this paper, we propose TAVA, a method to create Template-free Animatable Volumetric Actors, based on neural representations. We rely solely on multi-view data and a tracked skeleton to create a volumetric model of an actor, which can be animated at test time given a novel pose. Since TAVA does not require a body template, it is applicable to humans as well as other creatures such as animals. Furthermore, TAVA is designed such that it can recover accurate dense correspondences, making it amenable to content-creation and editing tasks. Through extensive experiments, we demonstrate that the proposed method generalizes well to novel poses as well as unseen views and showcase basic editing capabilities.
Submitted 20 June, 2022; v1 submitted 17 June, 2022; originally announced June 2022.
Comments: Code: https://github.com/facebookresearch/tava; Project Website: https://www.liruilong.cn/projects/tava/
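For reference, the traditional LBS mentioned in the abstract deforms each vertex as a skinning-weighted blend of per-bone rigid transforms. Below is a minimal NumPy sketch of standard linear blend skinning; the template, weights, and transforms are toy assumptions, unrelated to TAVA's learned model.

```python
import numpy as np

def linear_blend_skinning(vertices, weights, rotations, translations):
    """Deform rest-pose vertices by blending per-bone rigid transforms.

    vertices:     (V, 3) rest-pose vertex positions.
    weights:      (V, B) skinning weights; each row sums to 1.
    rotations:    (B, 3, 3) per-bone rotation matrices.
    translations: (B, 3) per-bone translations.
    """
    # transform every vertex by every bone: (B, V, 3)
    per_bone = np.einsum('bij,vj->bvi', rotations, vertices) + translations[:, None, :]
    # blend the per-bone results with the skinning weights: (V, 3)
    return np.einsum('vb,bvi->vi', weights, per_bone)

V, B = 100, 4
vertices = np.random.randn(V, 3)
weights = np.random.rand(V, B)
weights /= weights.sum(axis=1, keepdims=True)
rotations = np.stack([np.eye(3)] * B)            # identity pose
translations = np.zeros((B, 3))
deformed = linear_blend_skinning(vertices, weights, rotations, translations)
print(np.allclose(deformed, vertices))           # identity transforms leave the mesh unchanged
```
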
arXiv:2204.08451 [cs.CV]
Learning to Listen: Modeling Non-Deterministic Dyadic Facial Motion
Authors: Evonne Ng, Hanbyul Joo, Liwen Hu, Hao Li, Trevor Darrell, Angjoo Kanazawa, Shiry Ginosar
Abstract: We present a framework for modeling interactional communication in dyadic conversations: given multimodal inputs of a speaker, we autoregressively output multiple possibilities of corresponding listener motion. We combine the motion and speech audio of the speaker using a motion-audio cross attention transformer. Furthermore, we enable non-deterministic prediction by learning a discrete latent representation of realistic listener motion with a novel motion-encoding VQ-VAE. Our method organically captures the multimodal and non-deterministic nature of nonverbal dyadic interactions. Moreover, it produces realistic 3D listener facial motion synchronous with the speaker (see video). We demonstrate that our method outperforms baselines qualitatively and quantitatively via a rich suite of experiments. To facilitate this line of research, we introduce a novel and large in-the-wild dataset of dyadic conversations. Code, data, and videos available at https://evonneng.github.io/learning2listen/.
Submitted 18 April, 2022; originally announced April 2022.
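To illustrate the discrete latent representation used in a motion-encoding VQ-VAE, here is a minimal sketch of the vector-quantization step: each continuous latent is replaced by its nearest codebook entry. The codebook size and dimensions are arbitrary assumptions, not values from the paper.

```python
import numpy as np

def vector_quantize(latents, codebook):
    """Replace each continuous latent with its nearest codebook entry.

    latents:  (N, D) encoder outputs.
    codebook: (K, D) learned discrete codes.
    Returns the code indices and the quantized latents.
    """
    # squared distances between every latent and every code: (N, K)
    d = ((latents[:, None, :] - codebook[None, :, :]) ** 2).sum(axis=-1)
    indices = d.argmin(axis=1)
    return indices, codebook[indices]

codebook = np.random.randn(256, 64)      # K = 256 codes of dimension 64
latents = np.random.randn(10, 64)
idx, quantized = vector_quantize(latents, codebook)
print(idx.shape, quantized.shape)        # (10,), (10, 64)
```
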
arXiv:2204.07151 [cs.CV]
Deformable Sprites for Unsupervised Video Decomposition
Authors: Vickie Ye, Zhengqi Li, Richard Tucker, Angjoo Kanazawa, Noah Snavely
Abstract: We describe a method to extract persistent elements of a dynamic scene from an input video. We represent each scene element as a Deformable Sprite consisting of three components: 1) a 2D texture image for the entire video, 2) per-frame masks for the element, and 3) non-rigid deformations that map the texture image into each video frame. The resulting decomposition allows for applications such as consistent video editing. Deformable Sprites are a type of video auto-encoder model that is optimized on individual videos, and does not require training on a large dataset, nor does it rely on pre-trained models. Moreover, our method does not require object masks or other user input, and discovers moving objects of a wider variety than previous work. We evaluate our approach on standard video datasets and show qualitative results on a diverse array of Internet videos. Code and video results can be found at https://deformable-sprites.github.io
Submitted 14 April, 2022; originally announced April 2022.
Comments: CVPR 2022 Oral. Project Site: https://deformable-sprites.github.io
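As a sketch of the third component above, a non-rigid deformation can be realized by sampling the texture image at per-pixel coordinates for each frame. The example below uses SciPy's map_coordinates on toy arrays; it is illustrative only, not the authors' code.

```python
import numpy as np
from scipy.ndimage import map_coordinates

def warp_texture(texture, coords_y, coords_x):
    """Sample an RGB texture at real-valued (y, x) coordinates for each output pixel.

    texture:            (Ht, Wt, 3) texture image.
    coords_y, coords_x: (H, W) sampling coordinates into the texture,
                        e.g. an identity grid plus a non-rigid offset.
    """
    channels = [map_coordinates(texture[..., c], [coords_y, coords_x], order=1)
                for c in range(texture.shape[-1])]
    return np.stack(channels, axis=-1)

texture = np.random.rand(128, 128, 3)
H, W = 64, 64
yy, xx = np.meshgrid(np.linspace(0, 125, H), np.linspace(0, 125, W), indexing='ij')
offset_y = np.clip(yy + 2.0 * np.sin(xx / 10.0), 0, 127)   # identity grid plus a wavy offset
frame = warp_texture(texture, offset_y, xx)
print(frame.shape)   # (64, 64, 3)
```
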
arXiv:2112.05131 [cs.CV, cs.GR]
Plenoxels: Radiance Fields without Neural Networks
Authors: Alex Yu, Sara Fridovich-Keil, Matthew Tancik, Qinhong Chen, Benjamin Recht, Angjoo Kanazawa
Abstract: We introduce Plenoxels (plenoptic voxels), a system for photorealistic view synthesis. Plenoxels represent a scene as a sparse 3D grid with spherical harmonics. This representation can be optimized from calibrated images via gradient methods and regularization without any neural components. On standard benchmark tasks, Plenoxels are optimized two orders of magnitude faster than Neural Radiance Fields with no loss in visual quality.
Submitted 9 December, 2021; originally announced December 2021.
Comments: For video and code, please see https://alexyu.net/plenoxels
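A minimal sketch of the core operation behind a voxel-grid radiance field: trilinear interpolation of per-voxel values (density and spherical-harmonic coefficients) at a continuous 3D query point. The dense grid and channel count here are simplifying assumptions; Plenoxels itself uses a sparse grid.

```python
import numpy as np

def trilinear_interpolate(grid, point):
    """Trilinearly interpolate a (X, Y, Z, C) voxel grid at a continuous 3D point.

    point is given in voxel coordinates, e.g. (3.2, 7.9, 1.5).
    Returns the C interpolated channels (density, SH coefficients, ...).
    """
    p0 = np.floor(point).astype(int)
    t = point - p0                                   # fractional offsets in [0, 1)
    out = np.zeros(grid.shape[-1])
    for dx in (0, 1):
        for dy in (0, 1):
            for dz in (0, 1):
                w = ((1 - t[0]) if dx == 0 else t[0]) * \
                    ((1 - t[1]) if dy == 0 else t[1]) * \
                    ((1 - t[2]) if dz == 0 else t[2])
                out += w * grid[p0[0] + dx, p0[1] + dy, p0[2] + dz]
    return out

grid = np.random.rand(16, 16, 16, 28)   # e.g. 1 density + 27 SH coefficients per voxel
print(trilinear_interpolate(grid, np.array([3.2, 7.9, 1.5])))
```
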
arXiv:2112.04477 [cs.CV]
Tracking People by Predicting 3D Appearance, Location & Pose
Authors: Jathushan Rajasegaran, Georgios Pavlakos, Angjoo Kanazawa, Jitendra Malik
Abstract: In this paper, we present an approach for tracking people in monocular videos, by predicting their future 3D representations. To achieve this, we first lift people to 3D from a single frame in a robust way. This lifting includes information about the 3D pose of the person, his or her location in the 3D space, and the 3D appearance. As we track a person, we collect 3D observations over time in a tracklet representation. Given the 3D nature of our observations, we build temporal models for each one of the previous attributes.
We use these models to predict the future state of the tracklet, including 3D location, 3D appearance, and 3D pose. For a future frame, we compute the similarity between the predicted state of a tracklet and the single-frame observations in a probabilistic manner. Association is solved with simple Hungarian matching, and the matches are used to update the respective tracklets. We evaluate our approach on various benchmarks and report state-of-the-art results.
Submitted 8 December, 2021; originally announced December 2021.
Comments: Project Page: https://brjathu.github.io/PHALP/
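The association step described above can be illustrated with SciPy's Hungarian solver: given a cost matrix between predicted tracklet states and current-frame detections, linear_sum_assignment returns the matching. The Euclidean cost below is a toy stand-in, not the paper's probabilistic similarity.

```python
import numpy as np
from scipy.optimize import linear_sum_assignment

# toy predicted 3D locations for 3 tracklets and 4 detections in the new frame
predicted = np.array([[0.0, 0.0, 2.0], [1.0, 0.0, 3.0], [-1.0, 0.5, 2.5]])
detections = np.array([[1.1, 0.0, 3.0], [0.1, 0.0, 2.1], [5.0, 5.0, 5.0], [-0.9, 0.4, 2.4]])

# cost matrix: distance between every tracklet prediction and every detection
cost = np.linalg.norm(predicted[:, None, :] - detections[None, :, :], axis=-1)

rows, cols = linear_sum_assignment(cost)       # Hungarian matching
for r, c in zip(rows, cols):
    print(f"tracklet {r} -> detection {c} (cost {cost[r, c]:.2f})")
```
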
