Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/>  <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b">  <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\$","\$"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a>  <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div>  <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div>  <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–44 of 44 results for author: <span class="mathjax">Luan, F</span> </h1> </div> <div class="level-right is-hidden-mobile">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Luan%2C+F">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Luan, F"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Luan%2C+F&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Luan, F"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.12407">arXiv:2501.12407</a> <span> [<a href="https://arxiv.org/pdf/2501.12407">pdf</a>, <a href="https://arxiv.org/format/2501.12407">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> The Streaming Batch Model for Efficient and Fault-Tolerant Heterogeneous Execution </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Luan%2C+F+S">Frank Sifei Luan</a>, <a href="/search/cs?searchtype=author&query=Mao%2C+Z">Ziming Mao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+R+Y">Ron Yifeng Wang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+C">Charlotte Lin</a>, <a href="/search/cs?searchtype=author&query=Kamsetty%2C+A">Amog Kamsetty</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hao Chen</a>, <a href="/search/cs?searchtype=author&query=Su%2C+C">Cheng Su</a>, <a href="/search/cs?searchtype=author&query=Veeramani%2C+B">Balaji Veeramani</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+S">Scott Lee</a>, <a href="/search/cs?searchtype=author&query=Cho%2C+S">SangBin Cho</a>, <a href="/search/cs?searchtype=author&query=Zinzow%2C+C">Clark Zinzow</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+E">Eric Liang</a>, <a href="/search/cs?searchtype=author&query=Stoica%2C+I">Ion Stoica</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Stephanie Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.12407v4-abstract-short" style="display: inline;"> While ML model training and inference are both GPU-intensive, CPU-based data processing is often the bottleneck. Distributed data processing systems based on the batch or stream processing models assume homogeneous resource requirements. They excel at CPU-based computation but either under-utilize heterogeneous resources or impose high overheads on failure and reconfiguration. We introduce the str… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.12407v4-abstract-full').style.display = 'inline'; document.getElementById('2501.12407v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.12407v4-abstract-full" style="display: none;"> While ML model training and inference are both GPU-intensive, CPU-based data processing is often the bottleneck. Distributed data processing systems based on the batch or stream processing models assume homogeneous resource requirements. They excel at CPU-based computation but either under-utilize heterogeneous resources or impose high overheads on failure and reconfiguration. We introduce the streaming batch model, a hybrid of the two models that enables efficient and fault-tolerant heterogeneous execution. The key idea is to execute one partition at a time to allow lineage-based recovery with dynamic resource allocation. This enables memory-efficient pipelining across heterogeneous resources, similar to stream processing, but also offers the elasticity and fault tolerance properties of batch processing. We present Ray Data, an implementation of the streaming batch model that improves throughput on heterogeneous batch inference pipelines by 3--8$\times$ compared to traditional batch and stream processing systems. When training Stable Diffusion, Ray Data matches the throughput of single-node ML data loaders while additionally leveraging distributed heterogeneous clusters to further improve training throughput by 31%. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.12407v4-abstract-full').style.display = 'none'; document.getElementById('2501.12407v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.14166">arXiv:2412.14166</a> <span> [<a href="https://arxiv.org/pdf/2412.14166">pdf</a>, <a href="https://arxiv.org/format/2412.14166">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MegaSynth: Scaling Up 3D Scene Reconstruction with Synthesized Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jiang%2C+H">Hanwen Jiang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Zexiang Xu</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+D">Desai Xie</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Ziwen Chen</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+H">Haian Jin</a>, <a href="/search/cs?searchtype=author&query=Luan%2C+F">Fujun Luan</a>, <a href="/search/cs?searchtype=author&query=Shu%2C+Z">Zhixin Shu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kai Zhang</a>, <a href="/search/cs?searchtype=author&query=Bi%2C+S">Sai Bi</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+X">Xin Sun</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+J">Jiuxiang Gu</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Q">Qixing Huang</a>, <a href="/search/cs?searchtype=author&query=Pavlakos%2C+G">Georgios Pavlakos</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+H">Hao Tan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.14166v1-abstract-short" style="display: inline;"> We propose scaling up 3D scene reconstruction by training with synthesized data. At the core of our work is MegaSynth, a procedurally generated 3D dataset comprising 700K scenes - over 50 times larger than the prior real dataset DL3DV - dramatically scaling the training data. To enable scalable data generation, our key idea is eliminating semantic information, removing the need to model complex se… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.14166v1-abstract-full').style.display = 'inline'; document.getElementById('2412.14166v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.14166v1-abstract-full" style="display: none;"> We propose scaling up 3D scene reconstruction by training with synthesized data. At the core of our work is MegaSynth, a procedurally generated 3D dataset comprising 700K scenes - over 50 times larger than the prior real dataset DL3DV - dramatically scaling the training data. To enable scalable data generation, our key idea is eliminating semantic information, removing the need to model complex semantic priors such as object affordances and scene composition. Instead, we model scenes with basic spatial structures and geometry primitives, offering scalability. Besides, we control data complexity to facilitate training while loosely aligning it with real-world data distribution to benefit real-world generalization. We explore training LRMs with both MegaSynth and available real data. Experiment results show that joint training or pre-training with MegaSynth improves reconstruction quality by 1.2 to 1.8 dB PSNR across diverse image domains. Moreover, models trained solely on MegaSynth perform comparably to those trained on real data, underscoring the low-level nature of 3D reconstruction. Additionally, we provide an in-depth analysis of MegaSynth's properties for enhancing model capability, training stability, and generalization. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.14166v1-abstract-full').style.display = 'none'; document.getElementById('2412.14166v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page: https://hwjiang1510.github.io/MegaSynth/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.09966">arXiv:2412.09966</a> <span> [<a href="https://arxiv.org/pdf/2412.09966">pdf</a>, <a href="https://arxiv.org/format/2412.09966">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> EP-CFG: Energy-Preserving Classifier-Free Guidance </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kai Zhang</a>, <a href="/search/cs?searchtype=author&query=Luan%2C+F">Fujun Luan</a>, <a href="/search/cs?searchtype=author&query=Bi%2C+S">Sai Bi</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jianming Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.09966v1-abstract-short" style="display: inline;"> Classifier-free guidance (CFG) is widely used in diffusion models but often introduces over-contrast and over-saturation artifacts at higher guidance strengths. We present EP-CFG (Energy-Preserving Classifier-Free Guidance), which addresses these issues by preserving the energy distribution of the conditional prediction during the guidance process. Our method simply rescales the energy of the guid… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.09966v1-abstract-full').style.display = 'inline'; document.getElementById('2412.09966v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.09966v1-abstract-full" style="display: none;"> Classifier-free guidance (CFG) is widely used in diffusion models but often introduces over-contrast and over-saturation artifacts at higher guidance strengths. We present EP-CFG (Energy-Preserving Classifier-Free Guidance), which addresses these issues by preserving the energy distribution of the conditional prediction during the guidance process. Our method simply rescales the energy of the guided output to match that of the conditional prediction at each denoising step, with an optional robust variant for improved artifact suppression. Through experiments, we show that EP-CFG maintains natural image quality and preserves details across guidance strengths while retaining CFG's semantic alignment benefits, all with minimal computational overhead. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.09966v1-abstract-full').style.display = 'none'; document.getElementById('2412.09966v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.04470">arXiv:2412.04470</a> <span> [<a href="https://arxiv.org/pdf/2412.04470">pdf</a>, <a href="https://arxiv.org/format/2412.04470">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Turbo3D: Ultra-fast Text-to-3D Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hu%2C+H">Hanzhe Hu</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+T">Tianwei Yin</a>, <a href="/search/cs?searchtype=author&query=Luan%2C+F">Fujun Luan</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Y">Yiwei Hu</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+H">Hao Tan</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Zexiang Xu</a>, <a href="/search/cs?searchtype=author&query=Bi%2C+S">Sai Bi</a>, <a href="/search/cs?searchtype=author&query=Tulsiani%2C+S">Shubham Tulsiani</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kai Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.04470v1-abstract-short" style="display: inline;"> We present Turbo3D, an ultra-fast text-to-3D system capable of generating high-quality Gaussian splatting assets in under one second. Turbo3D employs a rapid 4-step, 4-view diffusion generator and an efficient feed-forward Gaussian reconstructor, both operating in latent space. The 4-step, 4-view generator is a student model distilled through a novel Dual-Teacher approach, which encourages the stu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.04470v1-abstract-full').style.display = 'inline'; document.getElementById('2412.04470v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.04470v1-abstract-full" style="display: none;"> We present Turbo3D, an ultra-fast text-to-3D system capable of generating high-quality Gaussian splatting assets in under one second. Turbo3D employs a rapid 4-step, 4-view diffusion generator and an efficient feed-forward Gaussian reconstructor, both operating in latent space. The 4-step, 4-view generator is a student model distilled through a novel Dual-Teacher approach, which encourages the student to learn view consistency from a multi-view teacher and photo-realism from a single-view teacher. By shifting the Gaussian reconstructor's inputs from pixel space to latent space, we eliminate the extra image decoding time and halve the transformer sequence length for maximum efficiency. Our method demonstrates superior 3D generation results compared to previous baselines, while operating in a fraction of their runtime. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.04470v1-abstract-full').style.display = 'none'; document.getElementById('2412.04470v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">project page: https://turbo-3d.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.03225">arXiv:2412.03225</a> <span> [<a href="https://arxiv.org/pdf/2412.03225">pdf</a>, <a href="https://arxiv.org/format/2412.03225">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MaterialPicker: Multi-Modal Material Generation with Diffusion Transformers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ma%2C+X">Xiaohe Ma</a>, <a href="/search/cs?searchtype=author&query=Deschaintre%2C+V">Valentin Deschaintre</a>, <a href="/search/cs?searchtype=author&query=Ha%C5%A1an%2C+M">Milo拧 Ha拧an</a>, <a href="/search/cs?searchtype=author&query=Luan%2C+F">Fujun Luan</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+K">Kun Zhou</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+H">Hongzhi Wu</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Y">Yiwei Hu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.03225v2-abstract-short" style="display: inline;"> High-quality material generation is key for virtual environment authoring and inverse rendering. We propose MaterialPicker, a multi-modal material generator leveraging a Diffusion Transformer (DiT) architecture, improving and simplifying the creation of high-quality materials from text prompts and/or photographs. Our method can generate a material based on an image crop of a material sample, even… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.03225v2-abstract-full').style.display = 'inline'; document.getElementById('2412.03225v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.03225v2-abstract-full" style="display: none;"> High-quality material generation is key for virtual environment authoring and inverse rendering. We propose MaterialPicker, a multi-modal material generator leveraging a Diffusion Transformer (DiT) architecture, improving and simplifying the creation of high-quality materials from text prompts and/or photographs. Our method can generate a material based on an image crop of a material sample, even if the captured surface is distorted, viewed at an angle or partially occluded, as is often the case in photographs of natural scenes. We further allow the user to specify a text prompt to provide additional guidance for the generation. We finetune a pre-trained DiT-based video generator into a material generator, where each material map is treated as a frame in a video sequence. We evaluate our approach both quantitatively and qualitatively and show that it enables more diverse material generation and better distortion correction than previous work. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.03225v2-abstract-full').style.display = 'none'; document.getElementById('2412.03225v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.01827">arXiv:2412.01827</a> <span> [<a href="https://arxiv.org/pdf/2412.01827">pdf</a>, <a href="https://arxiv.org/format/2412.01827">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> RandAR: Decoder-only Autoregressive Visual Generation in Random Orders </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Pang%2C+Z">Ziqi Pang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+T">Tianyuan Zhang</a>, <a href="/search/cs?searchtype=author&query=Luan%2C+F">Fujun Luan</a>, <a href="/search/cs?searchtype=author&query=Man%2C+Y">Yunze Man</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+H">Hao Tan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kai Zhang</a>, <a href="/search/cs?searchtype=author&query=Freeman%2C+W+T">William T. Freeman</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yu-Xiong Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.01827v1-abstract-short" style="display: inline;"> We introduce RandAR, a decoder-only visual autoregressive (AR) model capable of generating images in arbitrary token orders. Unlike previous decoder-only AR models that rely on a predefined generation order, RandAR removes this inductive bias, unlocking new capabilities in decoder-only generation. Our essential design enables random order by inserting a "position instruction token" before each ima… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.01827v1-abstract-full').style.display = 'inline'; document.getElementById('2412.01827v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.01827v1-abstract-full" style="display: none;"> We introduce RandAR, a decoder-only visual autoregressive (AR) model capable of generating images in arbitrary token orders. Unlike previous decoder-only AR models that rely on a predefined generation order, RandAR removes this inductive bias, unlocking new capabilities in decoder-only generation. Our essential design enables random order by inserting a "position instruction token" before each image token to be predicted, representing the spatial location of the next image token. Trained on randomly permuted token sequences -- a more challenging task than fixed-order generation, RandAR achieves comparable performance to its conventional raster-order counterpart. More importantly, decoder-only transformers trained from random orders acquire new capabilities. For the efficiency bottleneck of AR models, RandAR adopts parallel decoding with KV-Cache at inference time, enjoying 2.5x acceleration without sacrificing generation quality. Additionally, RandAR supports inpainting, outpainting and resolution extrapolation in a zero-shot manner. We hope RandAR inspires new directions for decoder-only visual generation models and broadens their applications across diverse scenarios. Our project page is at https://rand-ar.github.io/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.01827v1-abstract-full').style.display = 'none'; document.getElementById('2412.01827v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page: https://rand-ar.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.17249">arXiv:2411.17249</a> <span> [<a href="https://arxiv.org/pdf/2411.17249">pdf</a>, <a href="https://arxiv.org/format/2411.17249">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Buffer Anytime: Zero-Shot Video Depth and Normal from Image Priors </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kuang%2C+Z">Zhengfei Kuang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+T">Tianyuan Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kai Zhang</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+H">Hao Tan</a>, <a href="/search/cs?searchtype=author&query=Bi%2C+S">Sai Bi</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Y">Yiwei Hu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Zexiang Xu</a>, <a href="/search/cs?searchtype=author&query=Hasan%2C+M">Milos Hasan</a>, <a href="/search/cs?searchtype=author&query=Wetzstein%2C+G">Gordon Wetzstein</a>, <a href="/search/cs?searchtype=author&query=Luan%2C+F">Fujun Luan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.17249v1-abstract-short" style="display: inline;"> We present Buffer Anytime, a framework for estimation of depth and normal maps (which we call geometric buffers) from video that eliminates the need for paired video--depth and video--normal training data. Instead of relying on large-scale annotated video datasets, we demonstrate high-quality video buffer estimation by leveraging single-image priors with temporal consistency constraints. Our zero-… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17249v1-abstract-full').style.display = 'inline'; document.getElementById('2411.17249v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.17249v1-abstract-full" style="display: none;"> We present Buffer Anytime, a framework for estimation of depth and normal maps (which we call geometric buffers) from video that eliminates the need for paired video--depth and video--normal training data. Instead of relying on large-scale annotated video datasets, we demonstrate high-quality video buffer estimation by leveraging single-image priors with temporal consistency constraints. Our zero-shot training strategy combines state-of-the-art image estimation models based on optical flow smoothness through a hybrid loss function, implemented via a lightweight temporal attention architecture. Applied to leading image models like Depth Anything V2 and Marigold-E2E-FT, our approach significantly improves temporal consistency while maintaining accuracy. Experiments show that our method not only outperforms image-based approaches but also achieves results comparable to state-of-the-art video models trained on large-scale paired video datasets, despite using no such paired video data. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17249v1-abstract-full').style.display = 'none'; document.getElementById('2411.17249v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14384">arXiv:2411.14384</a> <span> [<a href="https://arxiv.org/pdf/2411.14384">pdf</a>, <a href="https://arxiv.org/format/2411.14384">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> Baking Gaussian Splatting into Diffusion Denoiser for Fast and Scalable Single-stage Image-to-3D Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cai%2C+Y">Yuanhao Cai</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">He Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kai Zhang</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+Y">Yixun Liang</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+M">Mengwei Ren</a>, <a href="/search/cs?searchtype=author&query=Luan%2C+F">Fujun Luan</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Q">Qing Liu</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+S+Y">Soo Ye Kim</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jianming Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhifei Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yuqian Zhou</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Z">Zhe Lin</a>, <a href="/search/cs?searchtype=author&query=Yuille%2C+A">Alan Yuille</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14384v2-abstract-short" style="display: inline;"> Existing feed-forward image-to-3D methods mainly rely on 2D multi-view diffusion models that cannot guarantee 3D consistency. These methods easily collapse when changing the prompt view direction and mainly handle object-centric prompt images. In this paper, we propose a novel single-stage 3D diffusion model, DiffusionGS, for object and scene generation from a single view. DiffusionGS directly out… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14384v2-abstract-full').style.display = 'inline'; document.getElementById('2411.14384v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14384v2-abstract-full" style="display: none;"> Existing feed-forward image-to-3D methods mainly rely on 2D multi-view diffusion models that cannot guarantee 3D consistency. These methods easily collapse when changing the prompt view direction and mainly handle object-centric prompt images. In this paper, we propose a novel single-stage 3D diffusion model, DiffusionGS, for object and scene generation from a single view. DiffusionGS directly outputs 3D Gaussian point clouds at each timestep to enforce view consistency and allow the model to generate robustly given prompt views of any directions, beyond object-centric inputs. Plus, to improve the capability and generalization ability of DiffusionGS, we scale up 3D training data by developing a scene-object mixed training strategy. Experiments show that our method enjoys better generation quality (2.20 dB higher in PSNR and 23.25 lower in FID) and over 5x faster speed (~6s on an A100 GPU) than SOTA methods. The user study and text-to-3D applications also reveals the practical values of our method. Our Project page at https://caiyuanhao1998.github.io/project/DiffusionGS/ shows the video and interactive generation results. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14384v2-abstract-full').style.display = 'none'; document.getElementById('2411.14384v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">A novel one-stage 3DGS-based diffusion generates objects and scenes from a single view in ~6 seconds</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13549">arXiv:2411.13549</a> <span> [<a href="https://arxiv.org/pdf/2411.13549">pdf</a>, <a href="https://arxiv.org/format/2411.13549">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Generating 3D-Consistent Videos from Unposed Internet Photos </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chou%2C+G">Gene Chou</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kai Zhang</a>, <a href="/search/cs?searchtype=author&query=Bi%2C+S">Sai Bi</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+H">Hao Tan</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Zexiang Xu</a>, <a href="/search/cs?searchtype=author&query=Luan%2C+F">Fujun Luan</a>, <a href="/search/cs?searchtype=author&query=Hariharan%2C+B">Bharath Hariharan</a>, <a href="/search/cs?searchtype=author&query=Snavely%2C+N">Noah Snavely</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13549v1-abstract-short" style="display: inline;"> We address the problem of generating videos from unposed internet photos. A handful of input images serve as keyframes, and our model interpolates between them to simulate a path moving between the cameras. Given random images, a model's ability to capture underlying geometry, recognize scene identity, and relate frames in terms of camera position and orientation reflects a fundamental understandi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13549v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13549v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13549v1-abstract-full" style="display: none;"> We address the problem of generating videos from unposed internet photos. A handful of input images serve as keyframes, and our model interpolates between them to simulate a path moving between the cameras. Given random images, a model's ability to capture underlying geometry, recognize scene identity, and relate frames in terms of camera position and orientation reflects a fundamental understanding of 3D structure and scene layout. However, existing video models such as Luma Dream Machine fail at this task. We design a self-supervised method that takes advantage of the consistency of videos and variability of multiview internet photos to train a scalable, 3D-aware video model without any 3D annotations such as camera parameters. We validate that our method outperforms all baselines in terms of geometric and appearance consistency. We also show our model benefits applications that enable camera control, such as 3D Gaussian Splatting. Our results suggest that we can scale up scene-level 3D learning using only 2D data such as videos and multiview internet photos. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13549v1-abstract-full').style.display = 'none'; document.getElementById('2411.13549v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02608">arXiv:2411.02608</a> <span> [<a href="https://arxiv.org/pdf/2411.02608">pdf</a>, <a href="https://arxiv.org/format/2411.02608">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> SSFold: Learning to Fold Arbitrary Crumpled Cloth Using Graph Dynamics from Human Demonstration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhou%2C+C">Changshi Zhou</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+H">Haichuan Xu</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+J">Jiarui Hu</a>, <a href="/search/cs?searchtype=author&query=Luan%2C+F">Feng Luan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhipeng Wang</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+Y">Yanchao Dong</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yanmin Zhou</a>, <a href="/search/cs?searchtype=author&query=He%2C+B">Bin He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02608v1-abstract-short" style="display: inline;"> Robotic cloth manipulation faces challenges due to the fabric's complex dynamics and the high dimensionality of configuration spaces. Previous methods have largely focused on isolated smoothing or folding tasks and overly reliant on simulations, often failing to bridge the significant sim-to-real gap in deformable object manipulation. To overcome these challenges, we propose a two-stream architect… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02608v1-abstract-full').style.display = 'inline'; document.getElementById('2411.02608v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02608v1-abstract-full" style="display: none;"> Robotic cloth manipulation faces challenges due to the fabric's complex dynamics and the high dimensionality of configuration spaces. Previous methods have largely focused on isolated smoothing or folding tasks and overly reliant on simulations, often failing to bridge the significant sim-to-real gap in deformable object manipulation. To overcome these challenges, we propose a two-stream architecture with sequential and spatial pathways, unifying smoothing and folding tasks into a single adaptable policy model that accommodates various cloth types and states. The sequential stream determines the pick and place positions for the cloth, while the spatial stream, using a connectivity dynamics model, constructs a visibility graph from partial point cloud data of the self-occluded cloth, allowing the robot to infer the cloth's full configuration from incomplete observations. To bridge the sim-to-real gap, we utilize a hand tracking detection algorithm to gather and integrate human demonstration data into our novel end-to-end neural network, improving real-world adaptability. Our method, validated on a UR5 robot across four distinct cloth folding tasks with different goal shapes, consistently achieves folded states from arbitrary crumpled initial configurations, with success rates of 99\%, 99\%, 83\%, and 67\%. It outperforms existing state-of-the-art cloth manipulation techniques and demonstrates strong generalization to unseen cloth with diverse colors, shapes, and stiffness in real-world experiments.Videos and source code are available at: https://zcswdt.github.io/SSFold/ <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02608v1-abstract-full').style.display = 'none'; document.getElementById('2411.02608v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.17242">arXiv:2410.17242</a> <span> [<a href="https://arxiv.org/pdf/2410.17242">pdf</a>, <a href="https://arxiv.org/format/2410.17242">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> LVSM: A Large View Synthesis Model with Minimal 3D Inductive Bias </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jin%2C+H">Haian Jin</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+H">Hanwen Jiang</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+H">Hao Tan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kai Zhang</a>, <a href="/search/cs?searchtype=author&query=Bi%2C+S">Sai Bi</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+T">Tianyuan Zhang</a>, <a href="/search/cs?searchtype=author&query=Luan%2C+F">Fujun Luan</a>, <a href="/search/cs?searchtype=author&query=Snavely%2C+N">Noah Snavely</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Zexiang Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.17242v1-abstract-short" style="display: inline;"> We propose the Large View Synthesis Model (LVSM), a novel transformer-based approach for scalable and generalizable novel view synthesis from sparse-view inputs. We introduce two architectures: (1) an encoder-decoder LVSM, which encodes input image tokens into a fixed number of 1D latent tokens, functioning as a fully learned scene representation, and decodes novel-view images from them; and (2) a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17242v1-abstract-full').style.display = 'inline'; document.getElementById('2410.17242v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.17242v1-abstract-full" style="display: none;"> We propose the Large View Synthesis Model (LVSM), a novel transformer-based approach for scalable and generalizable novel view synthesis from sparse-view inputs. We introduce two architectures: (1) an encoder-decoder LVSM, which encodes input image tokens into a fixed number of 1D latent tokens, functioning as a fully learned scene representation, and decodes novel-view images from them; and (2) a decoder-only LVSM, which directly maps input images to novel-view outputs, completely eliminating intermediate scene representations. Both models bypass the 3D inductive biases used in previous methods -- from 3D representations (e.g., NeRF, 3DGS) to network designs (e.g., epipolar projections, plane sweeps) -- addressing novel view synthesis with a fully data-driven approach. While the encoder-decoder model offers faster inference due to its independent latent representation, the decoder-only LVSM achieves superior quality, scalability, and zero-shot generalization, outperforming previous state-of-the-art methods by 1.5 to 3.5 dB PSNR. Comprehensive evaluations across multiple datasets demonstrate that both LVSM variants achieve state-of-the-art novel view synthesis quality. Notably, our models surpass all previous methods even with reduced computational resources (1-2 GPUs). Please see our website for more details: https://haian-jin.github.io/projects/LVSM/ . <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17242v1-abstract-full').style.display = 'none'; document.getElementById('2410.17242v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">project page: https://haian-jin.github.io/projects/LVSM/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.12781">arXiv:2410.12781</a> <span> [<a href="https://arxiv.org/pdf/2410.12781">pdf</a>, <a href="https://arxiv.org/format/2410.12781">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Long-LRM: Long-sequence Large Reconstruction Model for Wide-coverage Gaussian Splats </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ziwen%2C+C">Chen Ziwen</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+H">Hao Tan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kai Zhang</a>, <a href="/search/cs?searchtype=author&query=Bi%2C+S">Sai Bi</a>, <a href="/search/cs?searchtype=author&query=Luan%2C+F">Fujun Luan</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+Y">Yicong Hong</a>, <a href="/search/cs?searchtype=author&query=Fuxin%2C+L">Li Fuxin</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Zexiang Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.12781v1-abstract-short" style="display: inline;"> We propose Long-LRM, a generalizable 3D Gaussian reconstruction model that is capable of reconstructing a large scene from a long sequence of input images. Specifically, our model can process 32 source images at 960x540 resolution within only 1.3 seconds on a single A100 80G GPU. Our architecture features a mixture of the recent Mamba2 blocks and the classical transformer blocks which allowed many… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12781v1-abstract-full').style.display = 'inline'; document.getElementById('2410.12781v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.12781v1-abstract-full" style="display: none;"> We propose Long-LRM, a generalizable 3D Gaussian reconstruction model that is capable of reconstructing a large scene from a long sequence of input images. Specifically, our model can process 32 source images at 960x540 resolution within only 1.3 seconds on a single A100 80G GPU. Our architecture features a mixture of the recent Mamba2 blocks and the classical transformer blocks which allowed many more tokens to be processed than prior work, enhanced by efficient token merging and Gaussian pruning steps that balance between quality and efficiency. Unlike previous feed-forward models that are limited to processing 1~4 input images and can only reconstruct a small portion of a large scene, Long-LRM reconstructs the entire scene in a single feed-forward step. On large-scale scene datasets such as DL3DV-140 and Tanks and Temples, our method achieves performance comparable to optimization-based approaches while being two orders of magnitude more efficient. Project page: https://arthurhero.github.io/projects/llrm <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12781v1-abstract-full').style.display = 'none'; document.getElementById('2410.12781v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.06231">arXiv:2410.06231</a> <span> [<a href="https://arxiv.org/pdf/2410.06231">pdf</a>, <a href="https://arxiv.org/format/2410.06231">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> RelitLRM: Generative Relightable Radiance for Large Reconstruction Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+T">Tianyuan Zhang</a>, <a href="/search/cs?searchtype=author&query=Kuang%2C+Z">Zhengfei Kuang</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+H">Haian Jin</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Zexiang Xu</a>, <a href="/search/cs?searchtype=author&query=Bi%2C+S">Sai Bi</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+H">Hao Tan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">He Zhang</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Y">Yiwei Hu</a>, <a href="/search/cs?searchtype=author&query=Hasan%2C+M">Milos Hasan</a>, <a href="/search/cs?searchtype=author&query=Freeman%2C+W+T">William T. Freeman</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kai Zhang</a>, <a href="/search/cs?searchtype=author&query=Luan%2C+F">Fujun Luan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.06231v2-abstract-short" style="display: inline;"> We propose RelitLRM, a Large Reconstruction Model (LRM) for generating high-quality Gaussian splatting representations of 3D objects under novel illuminations from sparse (4-8) posed images captured under unknown static lighting. Unlike prior inverse rendering methods requiring dense captures and slow optimization, often causing artifacts like incorrect highlights or shadow baking, RelitLRM adopts… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06231v2-abstract-full').style.display = 'inline'; document.getElementById('2410.06231v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.06231v2-abstract-full" style="display: none;"> We propose RelitLRM, a Large Reconstruction Model (LRM) for generating high-quality Gaussian splatting representations of 3D objects under novel illuminations from sparse (4-8) posed images captured under unknown static lighting. Unlike prior inverse rendering methods requiring dense captures and slow optimization, often causing artifacts like incorrect highlights or shadow baking, RelitLRM adopts a feed-forward transformer-based model with a novel combination of a geometry reconstructor and a relightable appearance generator based on diffusion. The model is trained end-to-end on synthetic multi-view renderings of objects under varying known illuminations. This architecture design enables to effectively decompose geometry and appearance, resolve the ambiguity between material and lighting, and capture the multi-modal distribution of shadows and specularity in the relit appearance. We show our sparse-view feed-forward RelitLRM offers competitive relighting results to state-of-the-art dense-view optimization-based baselines while being significantly faster. Our project page is available at: https://relit-lrm.github.io/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06231v2-abstract-full').style.display = 'none'; document.getElementById('2410.06231v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">webpage: https://relit-lrm.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.19702">arXiv:2409.19702</a> <span> [<a href="https://arxiv.org/pdf/2409.19702">pdf</a>, <a href="https://arxiv.org/format/2409.19702">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> RNG: Relightable Neural Gaussians </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Fan%2C+J">Jiahui Fan</a>, <a href="/search/cs?searchtype=author&query=Luan%2C+F">Fujun Luan</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jian Yang</a>, <a href="/search/cs?searchtype=author&query=Ha%C5%A1an%2C+M">Milo拧 Ha拧an</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Beibei Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.19702v4-abstract-short" style="display: inline;"> 3D Gaussian Splatting (3DGS) has shown impressive results for the novel view synthesis task, where lighting is assumed to be fixed. However, creating relightable 3D assets, especially for objects with ill-defined shapes (fur, fabric, etc.), remains a challenging task. The decomposition between light, geometry, and material is ambiguous, especially if either smooth surface assumptions or surfacebas… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.19702v4-abstract-full').style.display = 'inline'; document.getElementById('2409.19702v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.19702v4-abstract-full" style="display: none;"> 3D Gaussian Splatting (3DGS) has shown impressive results for the novel view synthesis task, where lighting is assumed to be fixed. However, creating relightable 3D assets, especially for objects with ill-defined shapes (fur, fabric, etc.), remains a challenging task. The decomposition between light, geometry, and material is ambiguous, especially if either smooth surface assumptions or surfacebased analytical shading models do not apply. We propose Relightable Neural Gaussians (RNG), a novel 3DGS-based framework that enables the relighting of objects with both hard surfaces or soft boundaries, while avoiding assumptions on the shading model. We condition the radiance at each point on both view and light directions. We also introduce a shadow cue, as well as a depth refinement network to improve shadow accuracy. Finally, we propose a hybrid forward-deferred fitting strategy to balance geometry and appearance quality. Our method achieves significantly faster training (1.3 hours) and rendering (60 frames per second) compared to a prior method based on neural radiance fields and produces higher-quality shadows than a concurrent 3DGS-based method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.19702v4-abstract-full').style.display = 'none'; document.getElementById('2409.19702v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submission version</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.18974">arXiv:2409.18974</a> <span> [<a href="https://arxiv.org/pdf/2409.18974">pdf</a>, <a href="https://arxiv.org/format/2409.18974">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3680528.3687566">10.1145/3680528.3687566 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Neural Product Importance Sampling via Warp Composition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Litalien%2C+J">Joey Litalien</a>, <a href="/search/cs?searchtype=author&query=Ha%C5%A1an%2C+M">Milo拧 Ha拧an</a>, <a href="/search/cs?searchtype=author&query=Luan%2C+F">Fujun Luan</a>, <a href="/search/cs?searchtype=author&query=Mullia%2C+K">Krishna Mullia</a>, <a href="/search/cs?searchtype=author&query=Georgiev%2C+I">Iliyan Georgiev</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.18974v2-abstract-short" style="display: inline;"> Achieving high efficiency in modern photorealistic rendering hinges on using Monte Carlo sampling distributions that closely approximate the illumination integral estimated for every pixel. Samples are typically generated from a set of simple distributions, each targeting a different factor in the integrand, which are combined via multiple importance sampling. The resulting mixture distribution ca… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.18974v2-abstract-full').style.display = 'inline'; document.getElementById('2409.18974v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.18974v2-abstract-full" style="display: none;"> Achieving high efficiency in modern photorealistic rendering hinges on using Monte Carlo sampling distributions that closely approximate the illumination integral estimated for every pixel. Samples are typically generated from a set of simple distributions, each targeting a different factor in the integrand, which are combined via multiple importance sampling. The resulting mixture distribution can be far from the actual product of all factors, leading to sub-optimal variance even for direct-illumination estimation. We present a learning-based method that uses normalizing flows to efficiently importance sample illumination product integrals, e.g., the product of environment lighting and material terms. Our sampler composes a flow head warp with an emitter tail warp. The small conditional head warp is represented by a neural spline flow, while the large unconditional tail is discretized per environment map and its evaluation is instant. If the conditioning is low-dimensional, the head warp can be also discretized to achieve even better performance. We demonstrate variance reduction over prior methods on a range of applications comprising complex geometry, materials and illumination. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.18974v2-abstract-full').style.display = 'none'; document.getElementById('2409.18974v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published in ACM SIGGRAPH Asia 2024 Conference Papers. Project page: https://joeylitalien.github.io/publications/warp</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.06878">arXiv:2408.06878</a> <span> [<a href="https://arxiv.org/pdf/2408.06878">pdf</a>, <a href="https://arxiv.org/format/2408.06878">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> PBIR-NIE: Glossy Object Capture under Non-Distant Lighting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cai%2C+G">Guangyan Cai</a>, <a href="/search/cs?searchtype=author&query=Luan%2C+F">Fujun Luan</a>, <a href="/search/cs?searchtype=author&query=Ha%C5%A1an%2C+M">Milo拧 Ha拧an</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kai Zhang</a>, <a href="/search/cs?searchtype=author&query=Bi%2C+S">Sai Bi</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Zexiang Xu</a>, <a href="/search/cs?searchtype=author&query=Georgiev%2C+I">Iliyan Georgiev</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+S">Shuang Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.06878v1-abstract-short" style="display: inline;"> Glossy objects present a significant challenge for 3D reconstruction from multi-view input images under natural lighting. In this paper, we introduce PBIR-NIE, an inverse rendering framework designed to holistically capture the geometry, material attributes, and surrounding illumination of such objects. We propose a novel parallax-aware non-distant environment map as a lightweight and efficient li… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.06878v1-abstract-full').style.display = 'inline'; document.getElementById('2408.06878v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.06878v1-abstract-full" style="display: none;"> Glossy objects present a significant challenge for 3D reconstruction from multi-view input images under natural lighting. In this paper, we introduce PBIR-NIE, an inverse rendering framework designed to holistically capture the geometry, material attributes, and surrounding illumination of such objects. We propose a novel parallax-aware non-distant environment map as a lightweight and efficient lighting representation, accurately modeling the near-field background of the scene, which is commonly encountered in real-world capture setups. This feature allows our framework to accommodate complex parallax effects beyond the capabilities of standard infinite-distance environment maps. Our method optimizes an underlying signed distance field (SDF) through physics-based differentiable rendering, seamlessly connecting surface gradients between a triangle mesh and the SDF via neural implicit evolution (NIE). To address the intricacies of highly glossy BRDFs in differentiable rendering, we integrate the antithetic sampling algorithm to mitigate variance in the Monte Carlo gradient estimator. Consequently, our framework exhibits robust capabilities in handling glossy object reconstruction, showcasing superior quality in geometry, relighting, and material estimation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.06878v1-abstract-full').style.display = 'none'; document.getElementById('2408.06878v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.19451">arXiv:2407.19451</a> <span> [<a href="https://arxiv.org/pdf/2407.19451">pdf</a>, <a href="https://arxiv.org/format/2407.19451">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> Perm: A Parametric Representation for Multi-Style 3D Hair Modeling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=He%2C+C">Chengan He</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+X">Xin Sun</a>, <a href="/search/cs?searchtype=author&query=Shu%2C+Z">Zhixin Shu</a>, <a href="/search/cs?searchtype=author&query=Luan%2C+F">Fujun Luan</a>, <a href="/search/cs?searchtype=author&query=Pirk%2C+S">S枚ren Pirk</a>, <a href="/search/cs?searchtype=author&query=Herrera%2C+J+A+A">Jorge Alejandro Amador Herrera</a>, <a href="/search/cs?searchtype=author&query=Michels%2C+D+L">Dominik L. Michels</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+T+Y">Tuanfeng Y. Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Meng Zhang</a>, <a href="/search/cs?searchtype=author&query=Rushmeier%2C+H">Holly Rushmeier</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yi Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.19451v4-abstract-short" style="display: inline;"> We present Perm, a learned parametric representation of human 3D hair designed to facilitate various hair-related applications. Unlike previous work that jointly models the global hair structure and local curl patterns, we propose to disentangle them using a PCA-based strand representation in the frequency domain, thereby allowing more precise editing and output control. Specifically, we leverage… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.19451v4-abstract-full').style.display = 'inline'; document.getElementById('2407.19451v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.19451v4-abstract-full" style="display: none;"> We present Perm, a learned parametric representation of human 3D hair designed to facilitate various hair-related applications. Unlike previous work that jointly models the global hair structure and local curl patterns, we propose to disentangle them using a PCA-based strand representation in the frequency domain, thereby allowing more precise editing and output control. Specifically, we leverage our strand representation to fit and decompose hair geometry textures into low- to high-frequency hair structures, termed guide textures and residual textures, respectively. These decomposed textures are later parameterized with different generative models, emulating common stages in the hair grooming process. We conduct extensive experiments to validate the architecture design of Perm, and finally deploy the trained model as a generic prior to solve task-agnostic problems, further showcasing its flexibility and superiority in tasks such as single-view hair reconstruction, hairstyle editing, and hair-conditioned image generation. More details can be found on our project page: https://cs.yale.edu/homes/che/projects/perm/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.19451v4-abstract-full').style.display = 'none'; document.getElementById('2407.19451v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page: https://cs.yale.edu/homes/che/projects/perm/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.17774">arXiv:2406.17774</a> <span> [<a href="https://arxiv.org/pdf/2406.17774">pdf</a>, <a href="https://arxiv.org/format/2406.17774">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> Fast and Uncertainty-Aware SVBRDF Recovery from Multi-View Capture using Frequency Domain Analysis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wiersma%2C+R">Ruben Wiersma</a>, <a href="/search/cs?searchtype=author&query=Philip%2C+J">Julien Philip</a>, <a href="/search/cs?searchtype=author&query=Ha%C5%A1an%2C+M">Milo拧 Ha拧an</a>, <a href="/search/cs?searchtype=author&query=Mullia%2C+K">Krishna Mullia</a>, <a href="/search/cs?searchtype=author&query=Luan%2C+F">Fujun Luan</a>, <a href="/search/cs?searchtype=author&query=Eisemann%2C+E">Elmar Eisemann</a>, <a href="/search/cs?searchtype=author&query=Deschaintre%2C+V">Valentin Deschaintre</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.17774v1-abstract-short" style="display: inline;"> Relightable object acquisition is a key challenge in simplifying digital asset creation. Complete reconstruction of an object typically requires capturing hundreds to thousands of photographs under controlled illumination, with specialized equipment. The recent progress in differentiable rendering improved the quality and accessibility of inverse rendering optimization. Nevertheless, under uncontr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.17774v1-abstract-full').style.display = 'inline'; document.getElementById('2406.17774v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.17774v1-abstract-full" style="display: none;"> Relightable object acquisition is a key challenge in simplifying digital asset creation. Complete reconstruction of an object typically requires capturing hundreds to thousands of photographs under controlled illumination, with specialized equipment. The recent progress in differentiable rendering improved the quality and accessibility of inverse rendering optimization. Nevertheless, under uncontrolled illumination and unstructured viewpoints, there is no guarantee that the observations contain enough information to reconstruct the appearance properties of the captured object. We thus propose to consider the acquisition process from a signal-processing perspective. Given an object's geometry and a lighting environment, we estimate the properties of the materials on the object's surface in seconds. We do so by leveraging frequency domain analysis, considering the recovery of material properties as a deconvolution, enabling fast error estimation. We then quantify the uncertainty of the estimation, based on the available data, highlighting the areas for which priors or additional samples would be required for improved acquisition quality. We compare our approach to previous work and quantitatively evaluate our results, showing similar quality as previous work in a fraction of the time, and providing key information about the certainty of the results. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.17774v1-abstract-full').style.display = 'none'; document.getElementById('2406.17774v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page: https://brdf-uncertainty.github.io</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.07520">arXiv:2406.07520</a> <span> [<a href="https://arxiv.org/pdf/2406.07520">pdf</a>, <a href="https://arxiv.org/format/2406.07520">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> Neural Gaffer: Relighting Any Object via Diffusion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jin%2C+H">Haian Jin</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yuan Li</a>, <a href="/search/cs?searchtype=author&query=Luan%2C+F">Fujun Luan</a>, <a href="/search/cs?searchtype=author&query=Xiangli%2C+Y">Yuanbo Xiangli</a>, <a href="/search/cs?searchtype=author&query=Bi%2C+S">Sai Bi</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kai Zhang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Zexiang Xu</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+J">Jin Sun</a>, <a href="/search/cs?searchtype=author&query=Snavely%2C+N">Noah Snavely</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.07520v3-abstract-short" style="display: inline;"> Single-image relighting is a challenging task that involves reasoning about the complex interplay between geometry, materials, and lighting. Many prior methods either support only specific categories of images, such as portraits, or require special capture conditions, like using a flashlight. Alternatively, some methods explicitly decompose a scene into intrinsic components, such as normals and BR… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.07520v3-abstract-full').style.display = 'inline'; document.getElementById('2406.07520v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.07520v3-abstract-full" style="display: none;"> Single-image relighting is a challenging task that involves reasoning about the complex interplay between geometry, materials, and lighting. Many prior methods either support only specific categories of images, such as portraits, or require special capture conditions, like using a flashlight. Alternatively, some methods explicitly decompose a scene into intrinsic components, such as normals and BRDFs, which can be inaccurate or under-expressive. In this work, we propose a novel end-to-end 2D relighting diffusion model, called Neural Gaffer, that takes a single image of any object and can synthesize an accurate, high-quality relit image under any novel environmental lighting condition, simply by conditioning an image generator on a target environment map, without an explicit scene decomposition. Our method builds on a pre-trained diffusion model, and fine-tunes it on a synthetic relighting dataset, revealing and harnessing the inherent understanding of lighting present in the diffusion model. We evaluate our model on both synthetic and in-the-wild Internet imagery and demonstrate its advantages in terms of generalization and accuracy. Moreover, by combining with other generative methods, our model enables many downstream 2D tasks, such as text-based relighting and object insertion. Our model can also operate as a strong relighting prior for 3D tasks, such as relighting a radiance field. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.07520v3-abstract-full').style.display = 'none'; document.getElementById('2406.07520v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 11 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project Website: https://neural-gaffer.github.io</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.14847">arXiv:2405.14847</a> <span> [<a href="https://arxiv.org/pdf/2405.14847">pdf</a>, <a href="https://arxiv.org/format/2405.14847">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Neural Directional Encoding for Efficient and Accurate View-Dependent Appearance Modeling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+L">Liwen Wu</a>, <a href="/search/cs?searchtype=author&query=Bi%2C+S">Sai Bi</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Zexiang Xu</a>, <a href="/search/cs?searchtype=author&query=Luan%2C+F">Fujun Luan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kai Zhang</a>, <a href="/search/cs?searchtype=author&query=Georgiev%2C+I">Iliyan Georgiev</a>, <a href="/search/cs?searchtype=author&query=Sunkavalli%2C+K">Kalyan Sunkavalli</a>, <a href="/search/cs?searchtype=author&query=Ramamoorthi%2C+R">Ravi Ramamoorthi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.14847v1-abstract-short" style="display: inline;"> Novel-view synthesis of specular objects like shiny metals or glossy paints remains a significant challenge. Not only the glossy appearance but also global illumination effects, including reflections of other objects in the environment, are critical components to faithfully reproduce a scene. In this paper, we present Neural Directional Encoding (NDE), a view-dependent appearance encoding of neura… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.14847v1-abstract-full').style.display = 'inline'; document.getElementById('2405.14847v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.14847v1-abstract-full" style="display: none;"> Novel-view synthesis of specular objects like shiny metals or glossy paints remains a significant challenge. Not only the glossy appearance but also global illumination effects, including reflections of other objects in the environment, are critical components to faithfully reproduce a scene. In this paper, we present Neural Directional Encoding (NDE), a view-dependent appearance encoding of neural radiance fields (NeRF) for rendering specular objects. NDE transfers the concept of feature-grid-based spatial encoding to the angular domain, significantly improving the ability to model high-frequency angular signals. In contrast to previous methods that use encoding functions with only angular input, we additionally cone-trace spatial features to obtain a spatially varying directional encoding, which addresses the challenging interreflection effects. Extensive experiments on both synthetic and real datasets show that a NeRF model with NDE (1) outperforms the state of the art on view synthesis of specular objects, and (2) works with small networks to allow fast (real-time) inference. The project webpage and source code are available at: \url{https://lwwu2.github.io/nde/}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.14847v1-abstract-full').style.display = 'none'; document.getElementById('2405.14847v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to CVPR 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.00666">arXiv:2405.00666</a> <span> [<a href="https://arxiv.org/pdf/2405.00666">pdf</a>, <a href="https://arxiv.org/format/2405.00666">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3641519.3657445">10.1145/3641519.3657445 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> RGB$\leftrightarrow$X: Image decomposition and synthesis using material- and lighting-aware diffusion models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zeng%2C+Z">Zheng Zeng</a>, <a href="/search/cs?searchtype=author&query=Deschaintre%2C+V">Valentin Deschaintre</a>, <a href="/search/cs?searchtype=author&query=Georgiev%2C+I">Iliyan Georgiev</a>, <a href="/search/cs?searchtype=author&query=Hold-Geoffroy%2C+Y">Yannick Hold-Geoffroy</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Y">Yiwei Hu</a>, <a href="/search/cs?searchtype=author&query=Luan%2C+F">Fujun Luan</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+L">Ling-Qi Yan</a>, <a href="/search/cs?searchtype=author&query=Ha%C5%A1an%2C+M">Milo拧 Ha拧an</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.00666v1-abstract-short" style="display: inline;"> The three areas of realistic forward rendering, per-pixel inverse rendering, and generative image synthesis may seem like separate and unrelated sub-fields of graphics and vision. However, recent work has demonstrated improved estimation of per-pixel intrinsic channels (albedo, roughness, metallicity) based on a diffusion architecture; we call this the RGB$\rightarrow$X problem. We further show th… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.00666v1-abstract-full').style.display = 'inline'; document.getElementById('2405.00666v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.00666v1-abstract-full" style="display: none;"> The three areas of realistic forward rendering, per-pixel inverse rendering, and generative image synthesis may seem like separate and unrelated sub-fields of graphics and vision. However, recent work has demonstrated improved estimation of per-pixel intrinsic channels (albedo, roughness, metallicity) based on a diffusion architecture; we call this the RGB$\rightarrow$X problem. We further show that the reverse problem of synthesizing realistic images given intrinsic channels, X$\rightarrow$RGB, can also be addressed in a diffusion framework. Focusing on the image domain of interior scenes, we introduce an improved diffusion model for RGB$\rightarrow$X, which also estimates lighting, as well as the first diffusion X$\rightarrow$RGB model capable of synthesizing realistic images from (full or partial) intrinsic channels. Our X$\rightarrow$RGB model explores a middle ground between traditional rendering and generative models: we can specify only certain appearance properties that should be followed, and give freedom to the model to hallucinate a plausible version of the rest. This flexibility makes it possible to use a mix of heterogeneous training datasets, which differ in the available channels. We use multiple existing datasets and extend them with our own synthetic and real data, resulting in a model capable of extracting scene properties better than previous work and of generating highly realistic images of interior scenes. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.00666v1-abstract-full').style.display = 'none'; document.getElementById('2405.00666v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> SIGGRAPH Conference Papers '24, July 27-August 1, 2024, Denver, CO, USA </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.12385">arXiv:2404.12385</a> <span> [<a href="https://arxiv.org/pdf/2404.12385">pdf</a>, <a href="https://arxiv.org/format/2404.12385">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> MeshLRM: Large Reconstruction Model for High-Quality Meshes </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wei%2C+X">Xinyue Wei</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kai Zhang</a>, <a href="/search/cs?searchtype=author&query=Bi%2C+S">Sai Bi</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+H">Hao Tan</a>, <a href="/search/cs?searchtype=author&query=Luan%2C+F">Fujun Luan</a>, <a href="/search/cs?searchtype=author&query=Deschaintre%2C+V">Valentin Deschaintre</a>, <a href="/search/cs?searchtype=author&query=Sunkavalli%2C+K">Kalyan Sunkavalli</a>, <a href="/search/cs?searchtype=author&query=Su%2C+H">Hao Su</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Zexiang Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.12385v2-abstract-short" style="display: inline;"> We propose MeshLRM, a novel LRM-based approach that can reconstruct a high-quality mesh from merely four input images in less than one second. Different from previous large reconstruction models (LRMs) that focus on NeRF-based reconstruction, MeshLRM incorporates differentiable mesh extraction and rendering within the LRM framework. This allows for end-to-end mesh reconstruction by fine-tuning a p… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.12385v2-abstract-full').style.display = 'inline'; document.getElementById('2404.12385v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.12385v2-abstract-full" style="display: none;"> We propose MeshLRM, a novel LRM-based approach that can reconstruct a high-quality mesh from merely four input images in less than one second. Different from previous large reconstruction models (LRMs) that focus on NeRF-based reconstruction, MeshLRM incorporates differentiable mesh extraction and rendering within the LRM framework. This allows for end-to-end mesh reconstruction by fine-tuning a pre-trained NeRF LRM with mesh rendering. Moreover, we improve the LRM architecture by simplifying several complex designs in previous LRMs. MeshLRM's NeRF initialization is sequentially trained with low- and high-resolution images; this new LRM training strategy enables significantly faster convergence and thereby leads to better quality with less compute. Our approach achieves state-of-the-art mesh reconstruction from sparse-view inputs and also allows for many downstream applications, including text-to-3D and single-image-to-3D generation. Project page: https://sarahweiii.github.io/meshlrm/ <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.12385v2-abstract-full').style.display = 'none'; document.getElementById('2404.12385v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.11894">arXiv:2404.11894</a> <span> [<a href="https://arxiv.org/pdf/2404.11894">pdf</a>, <a href="https://arxiv.org/format/2404.11894">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> Rendering Participating Media Using Path Graphs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hu%2C+B">Becky Hu</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+X">Xi Deng</a>, <a href="/search/cs?searchtype=author&query=Luan%2C+F">Fujun Luan</a>, <a href="/search/cs?searchtype=author&query=Ha%C5%A1an%2C+M">Milo拧 Ha拧an</a>, <a href="/search/cs?searchtype=author&query=Marschner%2C+S">Steve Marschner</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.11894v1-abstract-short" style="display: inline;"> Rendering volumetric scattering media, including clouds, fog, smoke, and other complex materials, is crucial for realism in computer graphics. Traditional path tracing, while unbiased, requires many long path samples to converge in scenes with scattering media, and a lot of work is wasted by paths that make a negligible contribution to the image. Methods to make better use of the information learn… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.11894v1-abstract-full').style.display = 'inline'; document.getElementById('2404.11894v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.11894v1-abstract-full" style="display: none;"> Rendering volumetric scattering media, including clouds, fog, smoke, and other complex materials, is crucial for realism in computer graphics. Traditional path tracing, while unbiased, requires many long path samples to converge in scenes with scattering media, and a lot of work is wasted by paths that make a negligible contribution to the image. Methods to make better use of the information learned during path tracing range from photon mapping to radiance caching, but struggle to support the full range of heterogeneous scattering media. This paper introduces a new volumetric rendering algorithm that extends and adapts the previous \emph{path graph} surface rendering algorithm. Our method leverages the information collected through multiple-scattering transport paths to compute lower-noise estimates, increasing computational efficiency by reducing the required sample count. Our key contributions include an extended path graph for participating media and new aggregation and propagation operators for efficient path reuse in volumes. Compared to previous methods, our approach significantly boosts convergence in scenes with challenging volumetric light transport, including heterogeneous media with high scattering albedos and dense, forward-scattering translucent materials, under complex lighting conditions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.11894v1-abstract-full').style.display = 'none'; document.getElementById('2404.11894v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.04526">arXiv:2404.04526</a> <span> [<a href="https://arxiv.org/pdf/2404.04526">pdf</a>, <a href="https://arxiv.org/format/2404.04526">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> DATENeRF: Depth-Aware Text-based Editing of NeRFs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Rojas%2C+S">Sara Rojas</a>, <a href="/search/cs?searchtype=author&query=Philip%2C+J">Julien Philip</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kai Zhang</a>, <a href="/search/cs?searchtype=author&query=Bi%2C+S">Sai Bi</a>, <a href="/search/cs?searchtype=author&query=Luan%2C+F">Fujun Luan</a>, <a href="/search/cs?searchtype=author&query=Ghanem%2C+B">Bernard Ghanem</a>, <a href="/search/cs?searchtype=author&query=Sunkavall%2C+K">Kalyan Sunkavall</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.04526v2-abstract-short" style="display: inline;"> Recent advancements in diffusion models have shown remarkable proficiency in editing 2D images based on text prompts. However, extending these techniques to edit scenes in Neural Radiance Fields (NeRF) is complex, as editing individual 2D frames can result in inconsistencies across multiple views. Our crucial insight is that a NeRF scene's geometry can serve as a bridge to integrate these 2D edits… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.04526v2-abstract-full').style.display = 'inline'; document.getElementById('2404.04526v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.04526v2-abstract-full" style="display: none;"> Recent advancements in diffusion models have shown remarkable proficiency in editing 2D images based on text prompts. However, extending these techniques to edit scenes in Neural Radiance Fields (NeRF) is complex, as editing individual 2D frames can result in inconsistencies across multiple views. Our crucial insight is that a NeRF scene's geometry can serve as a bridge to integrate these 2D edits. Utilizing this geometry, we employ a depth-conditioned ControlNet to enhance the coherence of each 2D image modification. Moreover, we introduce an inpainting approach that leverages the depth information of NeRF scenes to distribute 2D edits across different images, ensuring robustness against errors and resampling challenges. Our results reveal that this methodology achieves more consistent, lifelike, and detailed edits than existing leading methods for text-driven NeRF scene editing. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.04526v2-abstract-full').style.display = 'none'; document.getElementById('2404.04526v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">3D Scene Editing, Neural Rendering, Diffusion Models, Accepted to ECCV24</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> ECCV 2024 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.09398">arXiv:2312.09398</a> <span> [<a href="https://arxiv.org/pdf/2312.09398">pdf</a>, <a href="https://arxiv.org/format/2312.09398">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3695866">10.1145/3695866 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> RNA: Relightable Neural Assets </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mullia%2C+K">Krishna Mullia</a>, <a href="/search/cs?searchtype=author&query=Luan%2C+F">Fujun Luan</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+X">Xin Sun</a>, <a href="/search/cs?searchtype=author&query=Ha%C5%A1an%2C+M">Milo拧 Ha拧an</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.09398v2-abstract-short" style="display: inline;"> High-fidelity 3D assets with materials composed of fibers (including hair), complex layered material shaders, or fine scattering geometry are ubiquitous in high-end realistic rendering applications. Rendering such models is computationally expensive due to heavy shaders and long scattering paths. Moreover, implementing the shading and scattering models is non-trivial and has to be done not only in… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.09398v2-abstract-full').style.display = 'inline'; document.getElementById('2312.09398v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.09398v2-abstract-full" style="display: none;"> High-fidelity 3D assets with materials composed of fibers (including hair), complex layered material shaders, or fine scattering geometry are ubiquitous in high-end realistic rendering applications. Rendering such models is computationally expensive due to heavy shaders and long scattering paths. Moreover, implementing the shading and scattering models is non-trivial and has to be done not only in the 3D content authoring software (which is necessarily complex), but also in all downstream rendering solutions. For example, web and mobile viewers for complex 3D assets are desirable, but frequently cannot support the full shading complexity allowed by the authoring application. Our goal is to design a neural representation for 3D assets with complex shading that supports full relightability and full integration into existing renderers. We provide an end-to-end shading solution at the first intersection of a ray with the underlying geometry. All shading and scattering is precomputed and included in the neural asset; no multiple scattering paths need to be traced, and no complex shading models need to be implemented to render our assets, beyond a single neural architecture. We combine an MLP decoder with a feature grid. Shading consists of querying a feature vector, followed by an MLP evaluation producing the final reflectance value. Our method provides high-fidelity shading, close to the ground-truth Monte Carlo estimate even at close-up views. We believe our neural assets could be used in practical renderers, providing significant speed-ups and simplifying renderer implementations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.09398v2-abstract-full').style.display = 'none'; document.getElementById('2312.09398v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Added Fig. 10 and Section 6.5 - new comparison</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.3.7 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.12024">arXiv:2311.12024</a> <span> [<a href="https://arxiv.org/pdf/2311.12024">pdf</a>, <a href="https://arxiv.org/format/2311.12024">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> PF-LRM: Pose-Free Large Reconstruction Model for Joint Pose and Shape Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+P">Peng Wang</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+H">Hao Tan</a>, <a href="/search/cs?searchtype=author&query=Bi%2C+S">Sai Bi</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Y">Yinghao Xu</a>, <a href="/search/cs?searchtype=author&query=Luan%2C+F">Fujun Luan</a>, <a href="/search/cs?searchtype=author&query=Sunkavalli%2C+K">Kalyan Sunkavalli</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wenping Wang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Zexiang Xu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kai Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.12024v2-abstract-short" style="display: inline;"> We propose a Pose-Free Large Reconstruction Model (PF-LRM) for reconstructing a 3D object from a few unposed images even with little visual overlap, while simultaneously estimating the relative camera poses in ~1.3 seconds on a single A100 GPU. PF-LRM is a highly scalable method utilizing the self-attention blocks to exchange information between 3D object tokens and 2D image tokens; we predict a c… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.12024v2-abstract-full').style.display = 'inline'; document.getElementById('2311.12024v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.12024v2-abstract-full" style="display: none;"> We propose a Pose-Free Large Reconstruction Model (PF-LRM) for reconstructing a 3D object from a few unposed images even with little visual overlap, while simultaneously estimating the relative camera poses in ~1.3 seconds on a single A100 GPU. PF-LRM is a highly scalable method utilizing the self-attention blocks to exchange information between 3D object tokens and 2D image tokens; we predict a coarse point cloud for each view, and then use a differentiable Perspective-n-Point (PnP) solver to obtain camera poses. When trained on a huge amount of multi-view posed data of ~1M objects, PF-LRM shows strong cross-dataset generalization ability, and outperforms baseline methods by a large margin in terms of pose prediction accuracy and 3D reconstruction quality on various unseen evaluation datasets. We also demonstrate our model's applicability in downstream text/image-to-3D task with fast feed-forward inference. Our project website is at: https://totoro97.github.io/pf-lrm . <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.12024v2-abstract-full').style.display = 'none'; document.getElementById('2311.12024v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project website: https://totoro97.github.io/pf-lrm ; add more experiments</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.09217">arXiv:2311.09217</a> <span> [<a href="https://arxiv.org/pdf/2311.09217">pdf</a>, <a href="https://arxiv.org/format/2311.09217">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> DMV3D: Denoising Multi-View Diffusion using 3D Large Reconstruction Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+Y">Yinghao Xu</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+H">Hao Tan</a>, <a href="/search/cs?searchtype=author&query=Luan%2C+F">Fujun Luan</a>, <a href="/search/cs?searchtype=author&query=Bi%2C+S">Sai Bi</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+P">Peng Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jiahao Li</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Z">Zifan Shi</a>, <a href="/search/cs?searchtype=author&query=Sunkavalli%2C+K">Kalyan Sunkavalli</a>, <a href="/search/cs?searchtype=author&query=Wetzstein%2C+G">Gordon Wetzstein</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Zexiang Xu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kai Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.09217v1-abstract-short" style="display: inline;"> We propose \textbf{DMV3D}, a novel 3D generation approach that uses a transformer-based 3D large reconstruction model to denoise multi-view diffusion. Our reconstruction model incorporates a triplane NeRF representation and can denoise noisy multi-view images via NeRF reconstruction and rendering, achieving single-stage 3D generation in $\sim$30s on single A100 GPU. We train \textbf{DMV3D} on larg… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.09217v1-abstract-full').style.display = 'inline'; document.getElementById('2311.09217v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.09217v1-abstract-full" style="display: none;"> We propose \textbf{DMV3D}, a novel 3D generation approach that uses a transformer-based 3D large reconstruction model to denoise multi-view diffusion. Our reconstruction model incorporates a triplane NeRF representation and can denoise noisy multi-view images via NeRF reconstruction and rendering, achieving single-stage 3D generation in $\sim$30s on single A100 GPU. We train \textbf{DMV3D} on large-scale multi-view image datasets of highly diverse objects using only image reconstruction losses, without accessing 3D assets. We demonstrate state-of-the-art results for the single-image reconstruction problem where probabilistic modeling of unseen object parts is required for generating diverse reconstructions with sharp textures. We also show high-quality text-to-3D generation results outperforming previous 3D diffusion models. Our project website is at: https://justimyhxu.github.io/projects/dmv3d/ . <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.09217v1-abstract-full').style.display = 'none'; document.getElementById('2311.09217v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project Page: https://justimyhxu.github.io/projects/dmv3d/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.06214">arXiv:2311.06214</a> <span> [<a href="https://arxiv.org/pdf/2311.06214">pdf</a>, <a href="https://arxiv.org/format/2311.06214">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Instant3D: Fast Text-to-3D with Sparse-View Generation and Large Reconstruction Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+J">Jiahao Li</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+H">Hao Tan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kai Zhang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Zexiang Xu</a>, <a href="/search/cs?searchtype=author&query=Luan%2C+F">Fujun Luan</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Y">Yinghao Xu</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+Y">Yicong Hong</a>, <a href="/search/cs?searchtype=author&query=Sunkavalli%2C+K">Kalyan Sunkavalli</a>, <a href="/search/cs?searchtype=author&query=Shakhnarovich%2C+G">Greg Shakhnarovich</a>, <a href="/search/cs?searchtype=author&query=Bi%2C+S">Sai Bi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.06214v2-abstract-short" style="display: inline;"> Text-to-3D with diffusion models has achieved remarkable progress in recent years. However, existing methods either rely on score distillation-based optimization which suffer from slow inference, low diversity and Janus problems, or are feed-forward methods that generate low-quality results due to the scarcity of 3D training data. In this paper, we propose Instant3D, a novel method that generates… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.06214v2-abstract-full').style.display = 'inline'; document.getElementById('2311.06214v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.06214v2-abstract-full" style="display: none;"> Text-to-3D with diffusion models has achieved remarkable progress in recent years. However, existing methods either rely on score distillation-based optimization which suffer from slow inference, low diversity and Janus problems, or are feed-forward methods that generate low-quality results due to the scarcity of 3D training data. In this paper, we propose Instant3D, a novel method that generates high-quality and diverse 3D assets from text prompts in a feed-forward manner. We adopt a two-stage paradigm, which first generates a sparse set of four structured and consistent views from text in one shot with a fine-tuned 2D text-to-image diffusion model, and then directly regresses the NeRF from the generated images with a novel transformer-based sparse-view reconstructor. Through extensive experiments, we demonstrate that our method can generate diverse 3D assets of high visual quality within 20 seconds, which is two orders of magnitude faster than previous optimization-based methods that can take 1 to 10 hours. Our project webpage: https://jiahao.ai/instant3d/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.06214v2-abstract-full').style.display = 'none'; document.getElementById('2311.06214v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project webpage: https://jiahao.ai/instant3d/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2309.11009">arXiv:2309.11009</a> <span> [<a href="https://arxiv.org/pdf/2309.11009">pdf</a>, <a href="https://arxiv.org/format/2309.11009">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Controllable Dynamic Appearance for Neural 3D Portraits </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Athar%2C+S">ShahRukh Athar</a>, <a href="/search/cs?searchtype=author&query=Shu%2C+Z">Zhixin Shu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Zexiang Xu</a>, <a href="/search/cs?searchtype=author&query=Luan%2C+F">Fujun Luan</a>, <a href="/search/cs?searchtype=author&query=Bi%2C+S">Sai Bi</a>, <a href="/search/cs?searchtype=author&query=Sunkavalli%2C+K">Kalyan Sunkavalli</a>, <a href="/search/cs?searchtype=author&query=Samaras%2C+D">Dimitris Samaras</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2309.11009v2-abstract-short" style="display: inline;"> Recent advances in Neural Radiance Fields (NeRFs) have made it possible to reconstruct and reanimate dynamic portrait scenes with control over head-pose, facial expressions and viewing direction. However, training such models assumes photometric consistency over the deformed region e.g. the face must be evenly lit as it deforms with changing head-pose and facial expression. Such photometric consis… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.11009v2-abstract-full').style.display = 'inline'; document.getElementById('2309.11009v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2309.11009v2-abstract-full" style="display: none;"> Recent advances in Neural Radiance Fields (NeRFs) have made it possible to reconstruct and reanimate dynamic portrait scenes with control over head-pose, facial expressions and viewing direction. However, training such models assumes photometric consistency over the deformed region e.g. the face must be evenly lit as it deforms with changing head-pose and facial expression. Such photometric consistency across frames of a video is hard to maintain, even in studio environments, thus making the created reanimatable neural portraits prone to artifacts during reanimation. In this work, we propose CoDyNeRF, a system that enables the creation of fully controllable 3D portraits in real-world capture conditions. CoDyNeRF learns to approximate illumination dependent effects via a dynamic appearance model in the canonical space that is conditioned on predicted surface normals and the facial expressions and head-pose deformations. The surface normals prediction is guided using 3DMM normals that act as a coarse prior for the normals of the human head, where direct prediction of normals is hard due to rigid and non-rigid deformations induced by head-pose and facial expression changes. Using only a smartphone-captured short video of a subject for training, we demonstrate the effectiveness of our method on free view synthesis of a portrait scene with explicit head pose and expression controls, and realistic lighting effects. The project page can be found here: http://shahrukhathar.github.io/2023/08/22/CoDyNeRF.html <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.11009v2-abstract-full').style.display = 'none'; document.getElementById('2309.11009v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2307.03244">arXiv:2307.03244</a> <span> [<a href="https://arxiv.org/pdf/2307.03244">pdf</a>, <a href="https://arxiv.org/format/2307.03244">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> PSDR-Room: Single Photo to Scene using Differentiable Rendering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yan%2C+K">Kai Yan</a>, <a href="/search/cs?searchtype=author&query=Luan%2C+F">Fujun Luan</a>, <a href="/search/cs?searchtype=author&query=Ha%C5%A0An%2C+M">Milo艩 Ha艩An</a>, <a href="/search/cs?searchtype=author&query=Groueix%2C+T">Thibault Groueix</a>, <a href="/search/cs?searchtype=author&query=Deschaintre%2C+V">Valentin Deschaintre</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+S">Shuang Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2307.03244v1-abstract-short" style="display: inline;"> A 3D digital scene contains many components: lights, materials and geometries, interacting to reach the desired appearance. Staging such a scene is time-consuming and requires both artistic and technical skills. In this work, we propose PSDR-Room, a system allowing to optimize lighting as well as the pose and materials of individual objects to match a target image of a room scene, with minimal use… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.03244v1-abstract-full').style.display = 'inline'; document.getElementById('2307.03244v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2307.03244v1-abstract-full" style="display: none;"> A 3D digital scene contains many components: lights, materials and geometries, interacting to reach the desired appearance. Staging such a scene is time-consuming and requires both artistic and technical skills. In this work, we propose PSDR-Room, a system allowing to optimize lighting as well as the pose and materials of individual objects to match a target image of a room scene, with minimal user input. To this end, we leverage a recent path-space differentiable rendering approach that provides unbiased gradients of the rendering with respect to geometry, lighting, and procedural materials, allowing us to optimize all of these components using gradient descent to visually match the input photo appearance. We use recent single-image scene understanding methods to initialize the optimization and search for appropriate 3D models and materials. We evaluate our method on real photographs of indoor scenes and demonstrate the editability of the resulting scene components. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.03244v1-abstract-full').style.display = 'none'; document.getElementById('2307.03244v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 July, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2303.07634">arXiv:2303.07634</a> <span> [<a href="https://arxiv.org/pdf/2303.07634">pdf</a>, <a href="https://arxiv.org/format/2303.07634">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> I$^2$-SDF: Intrinsic Indoor Scene Reconstruction and Editing via Raytracing in Neural SDFs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jingsen Zhu</a>, <a href="/search/cs?searchtype=author&query=Huo%2C+Y">Yuchi Huo</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+Q">Qi Ye</a>, <a href="/search/cs?searchtype=author&query=Luan%2C+F">Fujun Luan</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jifan Li</a>, <a href="/search/cs?searchtype=author&query=Xi%2C+D">Dianbing Xi</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L">Lisha Wang</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+R">Rui Tang</a>, <a href="/search/cs?searchtype=author&query=Hua%2C+W">Wei Hua</a>, <a href="/search/cs?searchtype=author&query=Bao%2C+H">Hujun Bao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+R">Rui Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2303.07634v2-abstract-short" style="display: inline;"> In this work, we present I$^2$-SDF, a new method for intrinsic indoor scene reconstruction and editing using differentiable Monte Carlo raytracing on neural signed distance fields (SDFs). Our holistic neural SDF-based framework jointly recovers the underlying shapes, incident radiance and materials from multi-view images. We introduce a novel bubble loss for fine-grained small objects and error-gu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.07634v2-abstract-full').style.display = 'inline'; document.getElementById('2303.07634v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2303.07634v2-abstract-full" style="display: none;"> In this work, we present I$^2$-SDF, a new method for intrinsic indoor scene reconstruction and editing using differentiable Monte Carlo raytracing on neural signed distance fields (SDFs). Our holistic neural SDF-based framework jointly recovers the underlying shapes, incident radiance and materials from multi-view images. We introduce a novel bubble loss for fine-grained small objects and error-guided adaptive sampling scheme to largely improve the reconstruction quality on large-scale indoor scenes. Further, we propose to decompose the neural radiance field into spatially-varying material of the scene as a neural field through surface-based, differentiable Monte Carlo raytracing and emitter semantic segmentations, which enables physically based and photorealistic scene relighting and editing applications. Through a number of qualitative and quantitative experiments, we demonstrate the superior quality of our method on indoor scene reconstruction, novel view synthesis, and scene editing compared to state-of-the-art baselines. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.07634v2-abstract-full').style.display = 'none'; document.getElementById('2303.07634v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 March, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 March, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by CVPR 2023, project page: https://jingsenzhu.github.io/i2-sdf</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2301.03734">arXiv:2301.03734</a> <span> [<a href="https://arxiv.org/pdf/2301.03734">pdf</a>, <a href="https://arxiv.org/format/2301.03734">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Operating Systems">cs.OS</span> </div> </div> <p class="title is-5 mathjax"> Exoshuffle-CloudSort </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Luan%2C+F+S">Frank Sifei Luan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Stephanie Wang</a>, <a href="/search/cs?searchtype=author&query=Yagati%2C+S">Samyukta Yagati</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+S">Sean Kim</a>, <a href="/search/cs?searchtype=author&query=Lien%2C+K">Kenneth Lien</a>, <a href="/search/cs?searchtype=author&query=Ong%2C+I">Isaac Ong</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+T">Tony Hong</a>, <a href="/search/cs?searchtype=author&query=Cho%2C+S">SangBin Cho</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+E">Eric Liang</a>, <a href="/search/cs?searchtype=author&query=Stoica%2C+I">Ion Stoica</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2301.03734v1-abstract-short" style="display: inline;"> We present Exoshuffle-CloudSort, a sorting application running on top of Ray using the Exoshuffle architecture. Exoshuffle-CloudSort runs on Amazon EC2, with input and output data stored on Amazon S3. Using 40 i4i.4xlarge workers, Exoshuffle-CloudSort completes the 100 TB CloudSort Benchmark (Indy category) in 5378 seconds, with an average total cost of $97. </span> <span class="abstract-full has-text-grey-dark mathjax" id="2301.03734v1-abstract-full" style="display: none;"> We present Exoshuffle-CloudSort, a sorting application running on top of Ray using the Exoshuffle architecture. Exoshuffle-CloudSort runs on Amazon EC2, with input and output data stored on Amazon S3. Using 40 i4i.4xlarge workers, Exoshuffle-CloudSort completes the 100 TB CloudSort Benchmark (Indy category) in 5378 seconds, with an average total cost of $97. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2301.03734v1-abstract-full').style.display = 'none'; document.getElementById('2301.03734v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 January, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2212.10699">arXiv:2212.10699</a> <span> [<a href="https://arxiv.org/pdf/2212.10699">pdf</a>, <a href="https://arxiv.org/format/2212.10699">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> PaletteNeRF: Palette-based Appearance Editing of Neural Radiance Fields </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kuang%2C+Z">Zhengfei Kuang</a>, <a href="/search/cs?searchtype=author&query=Luan%2C+F">Fujun Luan</a>, <a href="/search/cs?searchtype=author&query=Bi%2C+S">Sai Bi</a>, <a href="/search/cs?searchtype=author&query=Shu%2C+Z">Zhixin Shu</a>, <a href="/search/cs?searchtype=author&query=Wetzstein%2C+G">Gordon Wetzstein</a>, <a href="/search/cs?searchtype=author&query=Sunkavalli%2C+K">Kalyan Sunkavalli</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2212.10699v2-abstract-short" style="display: inline;"> Recent advances in neural radiance fields have enabled the high-fidelity 3D reconstruction of complex scenes for novel view synthesis. However, it remains underexplored how the appearance of such representations can be efficiently edited while maintaining photorealism. In this work, we present PaletteNeRF, a novel method for photorealistic appearance editing of neural radiance fields (NeRF) base… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.10699v2-abstract-full').style.display = 'inline'; document.getElementById('2212.10699v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2212.10699v2-abstract-full" style="display: none;"> Recent advances in neural radiance fields have enabled the high-fidelity 3D reconstruction of complex scenes for novel view synthesis. However, it remains underexplored how the appearance of such representations can be efficiently edited while maintaining photorealism. In this work, we present PaletteNeRF, a novel method for photorealistic appearance editing of neural radiance fields (NeRF) based on 3D color decomposition. Our method decomposes the appearance of each 3D point into a linear combination of palette-based bases (i.e., 3D segmentations defined by a group of NeRF-type functions) that are shared across the scene. While our palette-based bases are view-independent, we also predict a view-dependent function to capture the color residual (e.g., specular shading). During training, we jointly optimize the basis functions and the color palettes, and we also introduce novel regularizers to encourage the spatial coherence of the decomposition. Our method allows users to efficiently edit the appearance of the 3D scene by modifying the color palettes. We also extend our framework with compressed semantic features for semantic-aware appearance editing. We demonstrate that our technique is superior to baseline methods both quantitatively and qualitatively for appearance editing of complex real-world scenes. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.10699v2-abstract-full').style.display = 'none'; document.getElementById('2212.10699v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 January, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 December, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2211.03017">arXiv:2211.03017</a> <span> [<a href="https://arxiv.org/pdf/2211.03017">pdf</a>, <a href="https://arxiv.org/format/2211.03017">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> Learning-based Inverse Rendering of Complex Indoor Scenes with Differentiable Monte Carlo Raytracing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jingsen Zhu</a>, <a href="/search/cs?searchtype=author&query=Luan%2C+F">Fujun Luan</a>, <a href="/search/cs?searchtype=author&query=Huo%2C+Y">Yuchi Huo</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Z">Zihao Lin</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+Z">Zhihua Zhong</a>, <a href="/search/cs?searchtype=author&query=Xi%2C+D">Dianbing Xi</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+J">Jiaxiang Zheng</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+R">Rui Tang</a>, <a href="/search/cs?searchtype=author&query=Bao%2C+H">Hujun Bao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+R">Rui Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2211.03017v2-abstract-short" style="display: inline;"> Indoor scenes typically exhibit complex, spatially-varying appearance from global illumination, making inverse rendering a challenging ill-posed problem. This work presents an end-to-end, learning-based inverse rendering framework incorporating differentiable Monte Carlo raytracing with importance sampling. The framework takes a single image as input to jointly recover the underlying geometry, spa… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.03017v2-abstract-full').style.display = 'inline'; document.getElementById('2211.03017v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2211.03017v2-abstract-full" style="display: none;"> Indoor scenes typically exhibit complex, spatially-varying appearance from global illumination, making inverse rendering a challenging ill-posed problem. This work presents an end-to-end, learning-based inverse rendering framework incorporating differentiable Monte Carlo raytracing with importance sampling. The framework takes a single image as input to jointly recover the underlying geometry, spatially-varying lighting, and photorealistic materials. Specifically, we introduce a physically-based differentiable rendering layer with screen-space ray tracing, resulting in more realistic specular reflections that match the input photo. In addition, we create a large-scale, photorealistic indoor scene dataset with significantly richer details like complex furniture and dedicated decorations. Further, we design a novel out-of-view lighting network with uncertainty-aware refinement leveraging hypernetwork-based neural radiance fields to predict lighting outside the view of the input photo. Through extensive evaluations on common benchmark datasets, we demonstrate superior inverse rendering quality of our method compared to state-of-the-art baselines, enabling various applications such as complex object insertion and material editing with high fidelity. Code and data will be made available at \url{https://jingsenzhu.github.io/invrend}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.03017v2-abstract-full').style.display = 'none'; document.getElementById('2211.03017v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 November, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 5 November, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2206.06360">arXiv:2206.06360</a> <span> [<a href="https://arxiv.org/pdf/2206.06360">pdf</a>, <a href="https://arxiv.org/format/2206.06360">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ARF: Artistic Radiance Fields </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kai Zhang</a>, <a href="/search/cs?searchtype=author&query=Kolkin%2C+N">Nick Kolkin</a>, <a href="/search/cs?searchtype=author&query=Bi%2C+S">Sai Bi</a>, <a href="/search/cs?searchtype=author&query=Luan%2C+F">Fujun Luan</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Zexiang Xu</a>, <a href="/search/cs?searchtype=author&query=Shechtman%2C+E">Eli Shechtman</a>, <a href="/search/cs?searchtype=author&query=Snavely%2C+N">Noah Snavely</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2206.06360v1-abstract-short" style="display: inline;"> We present a method for transferring the artistic features of an arbitrary style image to a 3D scene. Previous methods that perform 3D stylization on point clouds or meshes are sensitive to geometric reconstruction errors for complex real-world scenes. Instead, we propose to stylize the more robust radiance field representation. We find that the commonly used Gram matrix-based loss tends to produc… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2206.06360v1-abstract-full').style.display = 'inline'; document.getElementById('2206.06360v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2206.06360v1-abstract-full" style="display: none;"> We present a method for transferring the artistic features of an arbitrary style image to a 3D scene. Previous methods that perform 3D stylization on point clouds or meshes are sensitive to geometric reconstruction errors for complex real-world scenes. Instead, we propose to stylize the more robust radiance field representation. We find that the commonly used Gram matrix-based loss tends to produce blurry results without faithful brushstrokes, and introduce a nearest neighbor-based loss that is highly effective at capturing style details while maintaining multi-view consistency. We also propose a novel deferred back-propagation method to enable optimization of memory-intensive radiance fields using style losses defined on full-resolution rendered images. Our extensive evaluation demonstrates that our method outperforms baselines by generating artistic appearance that more closely resembles the style image. Please check our project page for video results and open-source implementations: https://www.cs.cornell.edu/projects/arf/ . <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2206.06360v1-abstract-full').style.display = 'none'; document.getElementById('2206.06360v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 June, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page: https://www.cs.cornell.edu/projects/arf/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2206.05344">arXiv:2206.05344</a> <span> [<a href="https://arxiv.org/pdf/2206.05344">pdf</a>, <a href="https://arxiv.org/format/2206.05344">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Differentiable Rendering of Neural SDFs through Reparameterization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Bangaru%2C+S+P">Sai Praveen Bangaru</a>, <a href="/search/cs?searchtype=author&query=Gharbi%2C+M">Micha毛l Gharbi</a>, <a href="/search/cs?searchtype=author&query=Li%2C+T">Tzu-Mao Li</a>, <a href="/search/cs?searchtype=author&query=Luan%2C+F">Fujun Luan</a>, <a href="/search/cs?searchtype=author&query=Sunkavalli%2C+K">Kalyan Sunkavalli</a>, <a href="/search/cs?searchtype=author&query=Ha%C5%A1an%2C+M">Milo拧 Ha拧an</a>, <a href="/search/cs?searchtype=author&query=Bi%2C+S">Sai Bi</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Zexiang Xu</a>, <a href="/search/cs?searchtype=author&query=Bernstein%2C+G">Gilbert Bernstein</a>, <a href="/search/cs?searchtype=author&query=Durand%2C+F">Fr茅do Durand</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2206.05344v1-abstract-short" style="display: inline;"> We present a method to automatically compute correct gradients with respect to geometric scene parameters in neural SDF renderers. Recent physically-based differentiable rendering techniques for meshes have used edge-sampling to handle discontinuities, particularly at object silhouettes, but SDFs do not have a simple parametric form amenable to sampling. Instead, our approach builds on area-sampli… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2206.05344v1-abstract-full').style.display = 'inline'; document.getElementById('2206.05344v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2206.05344v1-abstract-full" style="display: none;"> We present a method to automatically compute correct gradients with respect to geometric scene parameters in neural SDF renderers. Recent physically-based differentiable rendering techniques for meshes have used edge-sampling to handle discontinuities, particularly at object silhouettes, but SDFs do not have a simple parametric form amenable to sampling. Instead, our approach builds on area-sampling techniques and develops a continuous warping function for SDFs to account for these discontinuities. Our method leverages the distance to surface encoded in an SDF and uses quadrature on sphere tracer points to compute this warping function. We further show that this can be done by subsampling the points to make the method tractable for neural SDFs. Our differentiable renderer can be used to optimize neural shapes from multi-view images and produces comparable 3D reconstructions to recent SDF-based inverse rendering methods, without the need for 2D segmentation masks to guide the geometry optimization and no volumetric approximations to the geometry. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2206.05344v1-abstract-full').style.display = 'none'; document.getElementById('2206.05344v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 June, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2204.02232">arXiv:2204.02232</a> <span> [<a href="https://arxiv.org/pdf/2204.02232">pdf</a>, <a href="https://arxiv.org/format/2204.02232">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> IRON: Inverse Rendering by Optimizing Neural SDFs and Materials from Photometric Images </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kai Zhang</a>, <a href="/search/cs?searchtype=author&query=Luan%2C+F">Fujun Luan</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhengqi Li</a>, <a href="/search/cs?searchtype=author&query=Snavely%2C+N">Noah Snavely</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2204.02232v1-abstract-short" style="display: inline;"> We propose a neural inverse rendering pipeline called IRON that operates on photometric images and outputs high-quality 3D content in the format of triangle meshes and material textures readily deployable in existing graphics pipelines. Our method adopts neural representations for geometry as signed distance fields (SDFs) and materials during optimization to enjoy their flexibility and compactness… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2204.02232v1-abstract-full').style.display = 'inline'; document.getElementById('2204.02232v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2204.02232v1-abstract-full" style="display: none;"> We propose a neural inverse rendering pipeline called IRON that operates on photometric images and outputs high-quality 3D content in the format of triangle meshes and material textures readily deployable in existing graphics pipelines. Our method adopts neural representations for geometry as signed distance fields (SDFs) and materials during optimization to enjoy their flexibility and compactness, and features a hybrid optimization scheme for neural SDFs: first, optimize using a volumetric radiance field approach to recover correct topology, then optimize further using edgeaware physics-based surface rendering for geometry refinement and disentanglement of materials and lighting. In the second stage, we also draw inspiration from mesh-based differentiable rendering, and design a novel edge sampling algorithm for neural SDFs to further improve performance. We show that our IRON achieves significantly better inverse rendering quality compared to prior works. Our project page is here: https://kai-46.github.io/IRON-website/ <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2204.02232v1-abstract-full').style.display = 'none'; document.getElementById('2204.02232v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 April, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">CVPR 2022; Project page is: https://kai-46.github.io/IRON-website/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2203.05072">arXiv:2203.05072</a> <span> [<a href="https://arxiv.org/pdf/2203.05072">pdf</a>, <a href="https://arxiv.org/format/2203.05072">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> Exoshuffle: An Extensible Shuffle Architecture </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Luan%2C+F+S">Frank Sifei Luan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Stephanie Wang</a>, <a href="/search/cs?searchtype=author&query=Yagati%2C+S">Samyukta Yagati</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+S">Sean Kim</a>, <a href="/search/cs?searchtype=author&query=Lien%2C+K">Kenneth Lien</a>, <a href="/search/cs?searchtype=author&query=Ong%2C+I">Isaac Ong</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+T">Tony Hong</a>, <a href="/search/cs?searchtype=author&query=Cho%2C+S">SangBin Cho</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+E">Eric Liang</a>, <a href="/search/cs?searchtype=author&query=Stoica%2C+I">Ion Stoica</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2203.05072v5-abstract-short" style="display: inline;"> Shuffle is one of the most expensive communication primitives in distributed data processing and is difficult to scale. Prior work addresses the scalability challenges of shuffle by building monolithic shuffle systems. These systems are costly to develop, and they are tightly integrated with batch processing frameworks that offer only high-level APIs such as SQL. New applications, such as ML train… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.05072v5-abstract-full').style.display = 'inline'; document.getElementById('2203.05072v5-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2203.05072v5-abstract-full" style="display: none;"> Shuffle is one of the most expensive communication primitives in distributed data processing and is difficult to scale. Prior work addresses the scalability challenges of shuffle by building monolithic shuffle systems. These systems are costly to develop, and they are tightly integrated with batch processing frameworks that offer only high-level APIs such as SQL. New applications, such as ML training, require more flexibility and finer-grained interoperability with shuffle. They are often unable to leverage existing shuffle optimizations. We propose an extensible shuffle architecture. We present Exoshuffle, a library for distributed shuffle that offers competitive performance and scalability as well as greater flexibility than monolithic shuffle systems. We design an architecture that decouples the shuffle control plane from the data plane without sacrificing performance. We build Exoshuffle on Ray, a distributed futures system for data and ML applications, and demonstrate that we can: (1) rewrite previous shuffle optimizations as application-level libraries with an order of magnitude less code, (2) achieve shuffle performance and scalability competitive with monolithic shuffle systems, and break the CloudSort record as the world's most cost-efficient sorting system, and (3) enable new applications such as ML training to easily leverage scalable shuffle. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.05072v5-abstract-full').style.display = 'none'; document.getElementById('2203.05072v5-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 March, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2104.00674">arXiv:2104.00674</a> <span> [<a href="https://arxiv.org/pdf/2104.00674">pdf</a>, <a href="https://arxiv.org/format/2104.00674">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> PhySG: Inverse Rendering with Spherical Gaussians for Physics-based Material Editing and Relighting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kai Zhang</a>, <a href="/search/cs?searchtype=author&query=Luan%2C+F">Fujun Luan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qianqian Wang</a>, <a href="/search/cs?searchtype=author&query=Bala%2C+K">Kavita Bala</a>, <a href="/search/cs?searchtype=author&query=Snavely%2C+N">Noah Snavely</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2104.00674v1-abstract-short" style="display: inline;"> We present PhySG, an end-to-end inverse rendering pipeline that includes a fully differentiable renderer and can reconstruct geometry, materials, and illumination from scratch from a set of RGB input images. Our framework represents specular BRDFs and environmental illumination using mixtures of spherical Gaussians, and represents geometry as a signed distance function parameterized as a Multi-Lay… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2104.00674v1-abstract-full').style.display = 'inline'; document.getElementById('2104.00674v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2104.00674v1-abstract-full" style="display: none;"> We present PhySG, an end-to-end inverse rendering pipeline that includes a fully differentiable renderer and can reconstruct geometry, materials, and illumination from scratch from a set of RGB input images. Our framework represents specular BRDFs and environmental illumination using mixtures of spherical Gaussians, and represents geometry as a signed distance function parameterized as a Multi-Layer Perceptron. The use of spherical Gaussians allows us to efficiently solve for approximate light transport, and our method works on scenes with challenging non-Lambertian reflectance captured under natural, static illumination. We demonstrate, with both synthetic and real data, that our reconstructions not only enable rendering of novel viewpoints, but also physics-based appearance editing of materials and illumination. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2104.00674v1-abstract-full').style.display = 'none'; document.getElementById('2104.00674v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 April, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to CVPR 2021; Project page: https://kai-46.github.io/PhySG-website/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2103.15208">arXiv:2103.15208</a> <span> [<a href="https://arxiv.org/pdf/2103.15208">pdf</a>, <a href="https://arxiv.org/format/2103.15208">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Unified Shape and SVBRDF Recovery using Differentiable Monte Carlo Rendering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Luan%2C+F">Fujun Luan</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+S">Shuang Zhao</a>, <a href="/search/cs?searchtype=author&query=Bala%2C+K">Kavita Bala</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+Z">Zhao Dong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2103.15208v3-abstract-short" style="display: inline;"> Reconstructing the shape and appearance of real-world objects using measured 2D images has been a long-standing problem in computer vision. In this paper, we introduce a new analysis-by-synthesis technique capable of producing high-quality reconstructions through robust coarse-to-fine optimization and physics-based differentiable rendering. Unlike most previous methods that handle geometry and r… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2103.15208v3-abstract-full').style.display = 'inline'; document.getElementById('2103.15208v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2103.15208v3-abstract-full" style="display: none;"> Reconstructing the shape and appearance of real-world objects using measured 2D images has been a long-standing problem in computer vision. In this paper, we introduce a new analysis-by-synthesis technique capable of producing high-quality reconstructions through robust coarse-to-fine optimization and physics-based differentiable rendering. Unlike most previous methods that handle geometry and reflectance largely separately, our method unifies the optimization of both by leveraging image gradients with respect to both object reflectance and geometry. To obtain physically accurate gradient estimates, we develop a new GPU-based Monte Carlo differentiable renderer leveraging recent advances in differentiable rendering theory to offer unbiased gradients while enjoying better performance than existing tools like PyTorch3D and redner. To further improve robustness, we utilize several shape and material priors as well as a coarse-to-fine optimization strategy to reconstruct geometry. We demonstrate that our technique can produce reconstructions with higher quality than previous methods such as COLMAP and Kinect Fusion. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2103.15208v3-abstract-full').style.display = 'none'; document.getElementById('2103.15208v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 June, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 March, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2021. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1812.11230">arXiv:1812.11230</a> <span> [<a href="https://arxiv.org/pdf/1812.11230">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Other Computer Science">cs.OH</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1088/1757-899X/399/1/012024">10.1088/1757-899X/399/1/012024 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> An intelligent household greenhouse system design based on Internet of Things </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zhonghua Han</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Z">Zhenbo Wu</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+S">Shuo Lin</a>, <a href="/search/cs?searchtype=author&query=Luan%2C+F">Fangjun Luan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1812.11230v1-abstract-short" style="display: inline;"> In order to combine indoor greenery conservation with Internet of Things (IOT) Technologies, this paper designs an intelligent household greenhouse project with the features of comprehensive sensing, reliable transmission and intelligent processing. Through the analysis of functional requirements of the intelligent household greenhouse system, an intelligent household greenhouse system is designed… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1812.11230v1-abstract-full').style.display = 'inline'; document.getElementById('1812.11230v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1812.11230v1-abstract-full" style="display: none;"> In order to combine indoor greenery conservation with Internet of Things (IOT) Technologies, this paper designs an intelligent household greenhouse project with the features of comprehensive sensing, reliable transmission and intelligent processing. Through the analysis of functional requirements of the intelligent household greenhouse system, an intelligent household greenhouse system is designed with the functions of greenhouse environmental data detection, greenhouse environmental control regulation, data remote transmission and human-computer interaction. Its sensor layer collects environmental data in real time based on the ZigBee wireless sensor network. The network layer STM32 intelligent gateway coordinates with network server, so as to exchange data from sensor layer to application layer, and solve the problems of non-blocking of data sending and receiving as well as concurrent requests of multiple mobile terminals. The application layer is designed into two types. One is a desktop management system as a data storage and analysis center, and the other is a mobile terminal APP. At the same time, we design a communication protocol that is applicable to the interaction of the three-layer structure of the Internet of Things, with the characteristics of simplicity, stability, readability, and scalability. It can avoid the mutual influence of multi-level data exchange and ensure the correctness of data circulation. In the design, the system sensor layer ensures stable transmission of various data and instructions, and the network layer has a high degree of concurrency and real time. And various measurement and control data of the sensor layer can interact with the data of mobile-terminal equipment of the application layer. The desktop management system and mobile terminal APP can monitor greenhouse data in real time and control various actuators in the greenhouse. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1812.11230v1-abstract-full').style.display = 'none'; document.getElementById('1812.11230v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 December, 2018; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2018. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1809.10820">arXiv:1809.10820</a> <span> [<a href="https://arxiv.org/pdf/1809.10820">pdf</a>, <a href="https://arxiv.org/format/1809.10820">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Inverse Transport Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Che%2C+C">Chengqian Che</a>, <a href="/search/cs?searchtype=author&query=Luan%2C+F">Fujun Luan</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+S">Shuang Zhao</a>, <a href="/search/cs?searchtype=author&query=Bala%2C+K">Kavita Bala</a>, <a href="/search/cs?searchtype=author&query=Gkioulekas%2C+I">Ioannis Gkioulekas</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1809.10820v1-abstract-short" style="display: inline;"> We introduce inverse transport networks as a learning architecture for inverse rendering problems where, given input image measurements, we seek to infer physical scene parameters such as shape, material, and illumination. During training, these networks are evaluated not only in terms of how close they can predict groundtruth parameters, but also in terms of whether the parameters they produce ca… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1809.10820v1-abstract-full').style.display = 'inline'; document.getElementById('1809.10820v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1809.10820v1-abstract-full" style="display: none;"> We introduce inverse transport networks as a learning architecture for inverse rendering problems where, given input image measurements, we seek to infer physical scene parameters such as shape, material, and illumination. During training, these networks are evaluated not only in terms of how close they can predict groundtruth parameters, but also in terms of whether the parameters they produce can be used, together with physically-accurate graphics renderers, to reproduce the input image measurements. To en- able training of inverse transport networks using stochastic gradient descent, we additionally create a general-purpose, physically-accurate differentiable renderer, which can be used to estimate derivatives of images with respect to arbitrary physical scene parameters. Our experiments demonstrate that inverse transport networks can be trained efficiently using differentiable rendering, and that they generalize to scenes with completely unseen geometry and illumination better than networks trained without appearance- matching regularization. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1809.10820v1-abstract-full').style.display = 'none'; document.getElementById('1809.10820v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 September, 2018; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2018. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1804.03189">arXiv:1804.03189</a> <span> [<a href="https://arxiv.org/pdf/1804.03189">pdf</a>, <a href="https://arxiv.org/format/1804.03189">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> Deep Painterly Harmonization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Luan%2C+F">Fujun Luan</a>, <a href="/search/cs?searchtype=author&query=Paris%2C+S">Sylvain Paris</a>, <a href="/search/cs?searchtype=author&query=Shechtman%2C+E">Eli Shechtman</a>, <a href="/search/cs?searchtype=author&query=Bala%2C+K">Kavita Bala</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1804.03189v4-abstract-short" style="display: inline;"> Copying an element from a photo and pasting it into a painting is a challenging task. Applying photo compositing techniques in this context yields subpar results that look like a collage --- and existing painterly stylization algorithms, which are global, perform poorly when applied locally. We address these issues with a dedicated algorithm that carefully determines the local statistics to be tra… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1804.03189v4-abstract-full').style.display = 'inline'; document.getElementById('1804.03189v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1804.03189v4-abstract-full" style="display: none;"> Copying an element from a photo and pasting it into a painting is a challenging task. Applying photo compositing techniques in this context yields subpar results that look like a collage --- and existing painterly stylization algorithms, which are global, perform poorly when applied locally. We address these issues with a dedicated algorithm that carefully determines the local statistics to be transferred. We ensure both spatial and inter-scale statistical consistency and demonstrate that both aspects are key to generating quality results. To cope with the diversity of abstraction levels and types of paintings, we introduce a technique to adjust the parameters of the transfer depending on the painting. We show that our algorithm produces significantly better results than photo compositing or global stylization techniques and that it enables creative painterly edits that would be otherwise difficult to achieve. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1804.03189v4-abstract-full').style.display = 'none'; document.getElementById('1804.03189v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 June, 2018; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 April, 2018; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2018. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1703.07511">arXiv:1703.07511</a> <span> [<a href="https://arxiv.org/pdf/1703.07511">pdf</a>, <a href="https://arxiv.org/format/1703.07511">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Deep Photo Style Transfer </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Luan%2C+F">Fujun Luan</a>, <a href="/search/cs?searchtype=author&query=Paris%2C+S">Sylvain Paris</a>, <a href="/search/cs?searchtype=author&query=Shechtman%2C+E">Eli Shechtman</a>, <a href="/search/cs?searchtype=author&query=Bala%2C+K">Kavita Bala</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1703.07511v3-abstract-short" style="display: inline;"> This paper introduces a deep-learning approach to photographic style transfer that handles a large variety of image content while faithfully transferring the reference style. Our approach builds upon the recent work on painterly transfer that separates style from the content of an image by considering different layers of a neural network. However, as is, this approach is not suitable for photoreal… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1703.07511v3-abstract-full').style.display = 'inline'; document.getElementById('1703.07511v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1703.07511v3-abstract-full" style="display: none;"> This paper introduces a deep-learning approach to photographic style transfer that handles a large variety of image content while faithfully transferring the reference style. Our approach builds upon the recent work on painterly transfer that separates style from the content of an image by considering different layers of a neural network. However, as is, this approach is not suitable for photorealistic style transfer. Even when both the input and reference images are photographs, the output still exhibits distortions reminiscent of a painting. Our contribution is to constrain the transformation from the input to the output to be locally affine in colorspace, and to express this constraint as a custom fully differentiable energy term. We show that this approach successfully suppresses distortion and yields satisfying photorealistic style transfers in a broad variety of scenarios, including transfer of the time of day, weather, season, and artistic edits. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1703.07511v3-abstract-full').style.display = 'none'; document.getElementById('1703.07511v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 April, 2017; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 22 March, 2017; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2017. </p> </li> </ol> <div class="is-hidden-tablet">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary">  <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div>   <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div>  </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

CINXE.COM

Search | arXiv e-print repository