Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/>  <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b">  <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\$","\$"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a>  <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div>  <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div>  <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 99 results for author: <span class="mathjax">Shechtman, E</span> </h1> </div> <div class="level-right is-hidden-mobile">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Shechtman%2C+E">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Shechtman, E"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Shechtman%2C+E&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Shechtman, E"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Shechtman%2C+E&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Shechtman%2C+E&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Shechtman%2C+E&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.01639">arXiv:2502.01639</a> <span> [<a href="https://arxiv.org/pdf/2502.01639">pdf</a>, <a href="https://arxiv.org/format/2502.01639">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> SliderSpace: Decomposing the Visual Capabilities of Diffusion Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gandikota%2C+R">Rohit Gandikota</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Z">Zongze Wu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Richard Zhang</a>, <a href="/search/cs?searchtype=author&query=Bau%2C+D">David Bau</a>, <a href="/search/cs?searchtype=author&query=Shechtman%2C+E">Eli Shechtman</a>, <a href="/search/cs?searchtype=author&query=Kolkin%2C+N">Nick Kolkin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.01639v1-abstract-short" style="display: inline;"> We present SliderSpace, a framework for automatically decomposing the visual capabilities of diffusion models into controllable and human-understandable directions. Unlike existing control methods that require a user to specify attributes for each edit direction individually, SliderSpace discovers multiple interpretable and diverse directions simultaneously from a single text prompt. Each directio… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.01639v1-abstract-full').style.display = 'inline'; document.getElementById('2502.01639v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.01639v1-abstract-full" style="display: none;"> We present SliderSpace, a framework for automatically decomposing the visual capabilities of diffusion models into controllable and human-understandable directions. Unlike existing control methods that require a user to specify attributes for each edit direction individually, SliderSpace discovers multiple interpretable and diverse directions simultaneously from a single text prompt. Each direction is trained as a low-rank adaptor, enabling compositional control and the discovery of surprising possibilities in the model's latent space. Through extensive experiments on state-of-the-art diffusion models, we demonstrate SliderSpace's effectiveness across three applications: concept decomposition, artistic style exploration, and diversity enhancement. Our quantitative evaluation shows that SliderSpace-discovered directions decompose the visual structure of model's knowledge effectively, offering insights into the latent capabilities encoded within diffusion models. User studies further validate that our method produces more diverse and useful variations compared to baselines. Our code, data and trained weights are available at https://sliderspace.baulab.info <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.01639v1-abstract-full').style.display = 'none'; document.getElementById('2502.01639v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project Website: https://sliderspace.baulab.info</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.16822">arXiv:2412.16822</a> <span> [<a href="https://arxiv.org/pdf/2412.16822">pdf</a>, <a href="https://arxiv.org/format/2412.16822">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Layer- and Timestep-Adaptive Differentiable Token Compression Ratios for Efficient Diffusion Transformers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=You%2C+H">Haoran You</a>, <a href="/search/cs?searchtype=author&query=Barnes%2C+C">Connelly Barnes</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yuqian Zhou</a>, <a href="/search/cs?searchtype=author&query=Kang%2C+Y">Yan Kang</a>, <a href="/search/cs?searchtype=author&query=Du%2C+Z">Zhenbang Du</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+W">Wei Zhou</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Lingzhi Zhang</a>, <a href="/search/cs?searchtype=author&query=Nitzan%2C+Y">Yotam Nitzan</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xiaoyang Liu</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Z">Zhe Lin</a>, <a href="/search/cs?searchtype=author&query=Shechtman%2C+E">Eli Shechtman</a>, <a href="/search/cs?searchtype=author&query=Amirghodsi%2C+S">Sohrab Amirghodsi</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Y+C">Yingyan Celine Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.16822v1-abstract-short" style="display: inline;"> Diffusion Transformers (DiTs) have achieved state-of-the-art (SOTA) image generation quality but suffer from high latency and memory inefficiency, making them difficult to deploy on resource-constrained devices. One key efficiency bottleneck is that existing DiTs apply equal computation across all regions of an image. However, not all image tokens are equally important, and certain localized areas… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.16822v1-abstract-full').style.display = 'inline'; document.getElementById('2412.16822v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.16822v1-abstract-full" style="display: none;"> Diffusion Transformers (DiTs) have achieved state-of-the-art (SOTA) image generation quality but suffer from high latency and memory inefficiency, making them difficult to deploy on resource-constrained devices. One key efficiency bottleneck is that existing DiTs apply equal computation across all regions of an image. However, not all image tokens are equally important, and certain localized areas require more computation, such as objects. To address this, we propose DiffRatio-MoD, a dynamic DiT inference framework with differentiable compression ratios, which automatically learns to dynamically route computation across layers and timesteps for each image token, resulting in Mixture-of-Depths (MoD) efficient DiT models. Specifically, DiffRatio-MoD integrates three features: (1) A token-level routing scheme where each DiT layer includes a router that is jointly fine-tuned with model weights to predict token importance scores. In this way, unimportant tokens bypass the entire layer's computation; (2) A layer-wise differentiable ratio mechanism where different DiT layers automatically learn varying compression ratios from a zero initialization, resulting in large compression ratios in redundant layers while others remain less compressed or even uncompressed; (3) A timestep-wise differentiable ratio mechanism where each denoising timestep learns its own compression ratio. The resulting pattern shows higher ratios for noisier timesteps and lower ratios as the image becomes clearer. Extensive experiments on both text-to-image and inpainting tasks show that DiffRatio-MoD effectively captures dynamism across token, layer, and timestep axes, achieving superior trade-offs between generation quality and efficiency compared to prior works. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.16822v1-abstract-full').style.display = 'none'; document.getElementById('2412.16822v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">16 pages, 13 figures, 4 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.07772">arXiv:2412.07772</a> <span> [<a href="https://arxiv.org/pdf/2412.07772">pdf</a>, <a href="https://arxiv.org/format/2412.07772">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> From Slow Bidirectional to Fast Autoregressive Video Diffusion Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yin%2C+T">Tianwei Yin</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Q">Qiang Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Richard Zhang</a>, <a href="/search/cs?searchtype=author&query=Freeman%2C+W+T">William T. Freeman</a>, <a href="/search/cs?searchtype=author&query=Durand%2C+F">Fredo Durand</a>, <a href="/search/cs?searchtype=author&query=Shechtman%2C+E">Eli Shechtman</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+X">Xun Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.07772v2-abstract-short" style="display: inline;"> Current video diffusion models achieve impressive generation quality but struggle in interactive applications due to bidirectional attention dependencies. The generation of a single frame requires the model to process the entire sequence, including the future. We address this limitation by adapting a pretrained bidirectional diffusion transformer to an autoregressive transformer that generates fra… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.07772v2-abstract-full').style.display = 'inline'; document.getElementById('2412.07772v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.07772v2-abstract-full" style="display: none;"> Current video diffusion models achieve impressive generation quality but struggle in interactive applications due to bidirectional attention dependencies. The generation of a single frame requires the model to process the entire sequence, including the future. We address this limitation by adapting a pretrained bidirectional diffusion transformer to an autoregressive transformer that generates frames on-the-fly. To further reduce latency, we extend distribution matching distillation (DMD) to videos, distilling 50-step diffusion model into a 4-step generator. To enable stable and high-quality distillation, we introduce a student initialization scheme based on teacher's ODE trajectories, as well as an asymmetric distillation strategy that supervises a causal student model with a bidirectional teacher. This approach effectively mitigates error accumulation in autoregressive generation, allowing long-duration video synthesis despite training on short clips. Our model achieves a total score of 84.27 on the VBench-Long benchmark, surpassing all previous video generation models. It enables fast streaming generation of high-quality videos at 9.4 FPS on a single GPU thanks to KV caching. Our approach also enables streaming video-to-video translation, image-to-video, and dynamic prompting in a zero-shot manner. We will release the code based on an open-source model in the future. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.07772v2-abstract-full').style.display = 'none'; document.getElementById('2412.07772v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project Page: https://causvid.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.00905">arXiv:2410.00905</a> <span> [<a href="https://arxiv.org/pdf/2410.00905">pdf</a>, <a href="https://arxiv.org/format/2410.00905">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Removing Distributional Discrepancies in Captions Improves Image-Text Alignment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yuheng Li</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Haotian Liu</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+M">Mu Cai</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yijun Li</a>, <a href="/search/cs?searchtype=author&query=Shechtman%2C+E">Eli Shechtman</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Z">Zhe Lin</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+Y+J">Yong Jae Lee</a>, <a href="/search/cs?searchtype=author&query=Singh%2C+K+K">Krishna Kumar Singh</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.00905v1-abstract-short" style="display: inline;"> In this paper, we introduce a model designed to improve the prediction of image-text alignment, targeting the challenge of compositional understanding in current visual-language models. Our approach focuses on generating high-quality training datasets for the alignment task by producing mixed-type negative captions derived from positive ones. Critically, we address the distribution imbalance betwe… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.00905v1-abstract-full').style.display = 'inline'; document.getElementById('2410.00905v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.00905v1-abstract-full" style="display: none;"> In this paper, we introduce a model designed to improve the prediction of image-text alignment, targeting the challenge of compositional understanding in current visual-language models. Our approach focuses on generating high-quality training datasets for the alignment task by producing mixed-type negative captions derived from positive ones. Critically, we address the distribution imbalance between positive and negative captions to ensure that the alignment model does not depend solely on textual information but also considers the associated images for predicting alignment accurately. By creating this enhanced training data, we fine-tune an existing leading visual-language model to boost its capability in understanding alignment. Our model significantly outperforms current top-performing methods across various datasets. We also demonstrate the applicability of our model by ranking the images generated by text-to-image models based on text alignment. Project page: \url{https://yuheng-li.github.io/LLaVA-score/} <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.00905v1-abstract-full').style.display = 'none'; document.getElementById('2410.00905v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.08332">arXiv:2408.08332</a> <span> [<a href="https://arxiv.org/pdf/2408.08332">pdf</a>, <a href="https://arxiv.org/format/2408.08332">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> TurboEdit: Instant text-based image editing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+Z">Zongze Wu</a>, <a href="/search/cs?searchtype=author&query=Kolkin%2C+N">Nicholas Kolkin</a>, <a href="/search/cs?searchtype=author&query=Brandt%2C+J">Jonathan Brandt</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Richard Zhang</a>, <a href="/search/cs?searchtype=author&query=Shechtman%2C+E">Eli Shechtman</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.08332v1-abstract-short" style="display: inline;"> We address the challenges of precise image inversion and disentangled image editing in the context of few-step diffusion models. We introduce an encoder based iterative inversion technique. The inversion network is conditioned on the input image and the reconstructed image from the previous step, allowing for correction of the next reconstruction towards the input image. We demonstrate that disent… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.08332v1-abstract-full').style.display = 'inline'; document.getElementById('2408.08332v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.08332v1-abstract-full" style="display: none;"> We address the challenges of precise image inversion and disentangled image editing in the context of few-step diffusion models. We introduce an encoder based iterative inversion technique. The inversion network is conditioned on the input image and the reconstructed image from the previous step, allowing for correction of the next reconstruction towards the input image. We demonstrate that disentangled controls can be easily achieved in the few-step diffusion model by conditioning on an (automatically generated) detailed text prompt. To manipulate the inverted image, we freeze the noise maps and modify one attribute in the text prompt (either manually or via instruction based editing driven by an LLM), resulting in the generation of a new image similar to the input image with only one attribute changed. It can further control the editing strength and accept instructive text prompt. Our approach facilitates realistic text-guided image edits in real-time, requiring only 8 number of functional evaluations (NFEs) in inversion (one-time cost) and 4 NFEs per edit. Our method is not only fast, but also significantly outperforms state-of-the-art multi-step diffusion editing techniques. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.08332v1-abstract-full').style.display = 'none'; document.getElementById('2408.08332v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to European Conference on Computer Vision (ECCV), 2024. Project page: https://betterze.github.io/TurboEdit/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.07480">arXiv:2406.07480</a> <span> [<a href="https://arxiv.org/pdf/2406.07480">pdf</a>, <a href="https://arxiv.org/format/2406.07480">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Image Neural Field Diffusion Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yinbo Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+O">Oliver Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Richard Zhang</a>, <a href="/search/cs?searchtype=author&query=Shechtman%2C+E">Eli Shechtman</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiaolong Wang</a>, <a href="/search/cs?searchtype=author&query=Gharbi%2C+M">Michael Gharbi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.07480v1-abstract-short" style="display: inline;"> Diffusion models have shown an impressive ability to model complex data distributions, with several key advantages over GANs, such as stable training, better coverage of the training distribution's modes, and the ability to solve inverse problems without extra training. However, most diffusion models learn the distribution of fixed-resolution images. We propose to learn the distribution of continu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.07480v1-abstract-full').style.display = 'inline'; document.getElementById('2406.07480v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.07480v1-abstract-full" style="display: none;"> Diffusion models have shown an impressive ability to model complex data distributions, with several key advantages over GANs, such as stable training, better coverage of the training distribution's modes, and the ability to solve inverse problems without extra training. However, most diffusion models learn the distribution of fixed-resolution images. We propose to learn the distribution of continuous images by training diffusion models on image neural fields, which can be rendered at any resolution, and show its advantages over fixed-resolution models. To achieve this, a key challenge is to obtain a latent space that represents photorealistic image neural fields. We propose a simple and effective method, inspired by several recent techniques but with key changes to make the image neural fields photorealistic. Our method can be used to convert existing latent diffusion autoencoders into image neural field autoencoders. We show that image neural field diffusion models can be trained using mixed-resolution image datasets, outperform fixed-resolution diffusion models followed by super-resolution models, and can solve inverse problems with conditions applied at different scales efficiently. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.07480v1-abstract-full').style.display = 'none'; document.getElementById('2406.07480v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page: https://yinboc.github.io/infd/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.14867">arXiv:2405.14867</a> <span> [<a href="https://arxiv.org/pdf/2405.14867">pdf</a>, <a href="https://arxiv.org/format/2405.14867">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Improved Distribution Matching Distillation for Fast Image Synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yin%2C+T">Tianwei Yin</a>, <a href="/search/cs?searchtype=author&query=Gharbi%2C+M">Micha毛l Gharbi</a>, <a href="/search/cs?searchtype=author&query=Park%2C+T">Taesung Park</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Richard Zhang</a>, <a href="/search/cs?searchtype=author&query=Shechtman%2C+E">Eli Shechtman</a>, <a href="/search/cs?searchtype=author&query=Durand%2C+F">Fredo Durand</a>, <a href="/search/cs?searchtype=author&query=Freeman%2C+W+T">William T. Freeman</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.14867v2-abstract-short" style="display: inline;"> Recent approaches have shown promises distilling diffusion models into efficient one-step generators. Among them, Distribution Matching Distillation (DMD) produces one-step generators that match their teacher in distribution, without enforcing a one-to-one correspondence with the sampling trajectories of their teachers. However, to ensure stable training, DMD requires an additional regression loss… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.14867v2-abstract-full').style.display = 'inline'; document.getElementById('2405.14867v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.14867v2-abstract-full" style="display: none;"> Recent approaches have shown promises distilling diffusion models into efficient one-step generators. Among them, Distribution Matching Distillation (DMD) produces one-step generators that match their teacher in distribution, without enforcing a one-to-one correspondence with the sampling trajectories of their teachers. However, to ensure stable training, DMD requires an additional regression loss computed using a large set of noise-image pairs generated by the teacher with many steps of a deterministic sampler. This is costly for large-scale text-to-image synthesis and limits the student's quality, tying it too closely to the teacher's original sampling paths. We introduce DMD2, a set of techniques that lift this limitation and improve DMD training. First, we eliminate the regression loss and the need for expensive dataset construction. We show that the resulting instability is due to the fake critic not estimating the distribution of generated samples accurately and propose a two time-scale update rule as a remedy. Second, we integrate a GAN loss into the distillation procedure, discriminating between generated samples and real images. This lets us train the student model on real data, mitigating the imperfect real score estimation from the teacher model, and enhancing quality. Lastly, we modify the training procedure to enable multi-step sampling. We identify and address the training-inference input mismatch problem in this setting, by simulating inference-time generator samples during training time. Taken together, our improvements set new benchmarks in one-step image generation, with FID scores of 1.28 on ImageNet-64x64 and 8.35 on zero-shot COCO 2014, surpassing the original teacher despite a 500X reduction in inference cost. Further, we show our approach can generate megapixel images by distilling SDXL, demonstrating exceptional visual quality among few-step methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.14867v2-abstract-full').style.display = 'none'; document.getElementById('2405.14867v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Code, model, and dataset are available at https://tianweiy.github.io/dmd2</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.05967">arXiv:2405.05967</a> <span> [<a href="https://arxiv.org/pdf/2405.05967">pdf</a>, <a href="https://arxiv.org/format/2405.05967">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Distilling Diffusion Models into Conditional GANs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kang%2C+M">Minguk Kang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Richard Zhang</a>, <a href="/search/cs?searchtype=author&query=Barnes%2C+C">Connelly Barnes</a>, <a href="/search/cs?searchtype=author&query=Paris%2C+S">Sylvain Paris</a>, <a href="/search/cs?searchtype=author&query=Kwak%2C+S">Suha Kwak</a>, <a href="/search/cs?searchtype=author&query=Park%2C+J">Jaesik Park</a>, <a href="/search/cs?searchtype=author&query=Shechtman%2C+E">Eli Shechtman</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jun-Yan Zhu</a>, <a href="/search/cs?searchtype=author&query=Park%2C+T">Taesung Park</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.05967v3-abstract-short" style="display: inline;"> We propose a method to distill a complex multistep diffusion model into a single-step conditional GAN student model, dramatically accelerating inference, while preserving image quality. Our approach interprets diffusion distillation as a paired image-to-image translation task, using noise-to-image pairs of the diffusion model's ODE trajectory. For efficient regression loss computation, we propose… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.05967v3-abstract-full').style.display = 'inline'; document.getElementById('2405.05967v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.05967v3-abstract-full" style="display: none;"> We propose a method to distill a complex multistep diffusion model into a single-step conditional GAN student model, dramatically accelerating inference, while preserving image quality. Our approach interprets diffusion distillation as a paired image-to-image translation task, using noise-to-image pairs of the diffusion model's ODE trajectory. For efficient regression loss computation, we propose E-LatentLPIPS, a perceptual loss operating directly in diffusion model's latent space, utilizing an ensemble of augmentations. Furthermore, we adapt a diffusion model to construct a multi-scale discriminator with a text alignment loss to build an effective conditional GAN-based formulation. E-LatentLPIPS converges more efficiently than many existing distillation methods, even accounting for dataset construction costs. We demonstrate that our one-step generator outperforms cutting-edge one-step diffusion distillation models -- DMD, SDXL-Turbo, and SDXL-Lightning -- on the zero-shot COCO benchmark. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.05967v3-abstract-full').style.display = 'none'; document.getElementById('2405.05967v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page: https://mingukkang.github.io/Diffusion2GAN/ (ECCV2024)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.16029">arXiv:2404.16029</a> <span> [<a href="https://arxiv.org/pdf/2404.16029">pdf</a>, <a href="https://arxiv.org/format/2404.16029">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Editable Image Elements for Controllable Synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mu%2C+J">Jiteng Mu</a>, <a href="/search/cs?searchtype=author&query=Gharbi%2C+M">Micha毛l Gharbi</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Richard Zhang</a>, <a href="/search/cs?searchtype=author&query=Shechtman%2C+E">Eli Shechtman</a>, <a href="/search/cs?searchtype=author&query=Vasconcelos%2C+N">Nuno Vasconcelos</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiaolong Wang</a>, <a href="/search/cs?searchtype=author&query=Park%2C+T">Taesung Park</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.16029v1-abstract-short" style="display: inline;"> Diffusion models have made significant advances in text-guided synthesis tasks. However, editing user-provided images remains challenging, as the high dimensional noise input space of diffusion models is not naturally suited for image inversion or spatial editing. In this work, we propose an image representation that promotes spatial editing of input images using a diffusion model. Concretely, we… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.16029v1-abstract-full').style.display = 'inline'; document.getElementById('2404.16029v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.16029v1-abstract-full" style="display: none;"> Diffusion models have made significant advances in text-guided synthesis tasks. However, editing user-provided images remains challenging, as the high dimensional noise input space of diffusion models is not naturally suited for image inversion or spatial editing. In this work, we propose an image representation that promotes spatial editing of input images using a diffusion model. Concretely, we learn to encode an input into "image elements" that can faithfully reconstruct an input image. These elements can be intuitively edited by a user, and are decoded by a diffusion model into realistic images. We show the effectiveness of our representation on various image editing tasks, such as object resizing, rearrangement, dragging, de-occlusion, removal, variation, and image composition. Project page: https://jitengmu.github.io/Editable_Image_Elements/ <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.16029v1-abstract-full').style.display = 'none'; document.getElementById('2404.16029v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page: https://jitengmu.github.io/Editable_Image_Elements/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.12388">arXiv:2404.12388</a> <span> [<a href="https://arxiv.org/pdf/2404.12388">pdf</a>, <a href="https://arxiv.org/format/2404.12388">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> VideoGigaGAN: Towards Detail-rich Video Super-Resolution </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+Y">Yiran Xu</a>, <a href="/search/cs?searchtype=author&query=Park%2C+T">Taesung Park</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Richard Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yang Zhou</a>, <a href="/search/cs?searchtype=author&query=Shechtman%2C+E">Eli Shechtman</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+F">Feng Liu</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+J">Jia-Bin Huang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+D">Difan Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.12388v2-abstract-short" style="display: inline;"> Video super-resolution (VSR) approaches have shown impressive temporal consistency in upsampled videos. However, these approaches tend to generate blurrier results than their image counterparts as they are limited in their generative capability. This raises a fundamental question: can we extend the success of a generative image upsampler to the VSR task while preserving the temporal consistency? W… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.12388v2-abstract-full').style.display = 'inline'; document.getElementById('2404.12388v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.12388v2-abstract-full" style="display: none;"> Video super-resolution (VSR) approaches have shown impressive temporal consistency in upsampled videos. However, these approaches tend to generate blurrier results than their image counterparts as they are limited in their generative capability. This raises a fundamental question: can we extend the success of a generative image upsampler to the VSR task while preserving the temporal consistency? We introduce VideoGigaGAN, a new generative VSR model that can produce videos with high-frequency details and temporal consistency. VideoGigaGAN builds upon a large-scale image upsampler -- GigaGAN. Simply inflating GigaGAN to a video model by adding temporal modules produces severe temporal flickering. We identify several key issues and propose techniques that significantly improve the temporal consistency of upsampled videos. Our experiments show that, unlike previous VSR methods, VideoGigaGAN generates temporally consistent videos with more fine-grained appearance details. We validate the effectiveness of VideoGigaGAN by comparing it with state-of-the-art VSR models on public datasets and showcasing video results with $8\times$ super-resolution. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.12388v2-abstract-full').style.display = 'none'; document.getElementById('2404.12388v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">project page: https://videogigagan.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.12382">arXiv:2404.12382</a> <span> [<a href="https://arxiv.org/pdf/2404.12382">pdf</a>, <a href="https://arxiv.org/format/2404.12382">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> Lazy Diffusion Transformer for Interactive Image Editing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Nitzan%2C+Y">Yotam Nitzan</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Z">Zongze Wu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Richard Zhang</a>, <a href="/search/cs?searchtype=author&query=Shechtman%2C+E">Eli Shechtman</a>, <a href="/search/cs?searchtype=author&query=Cohen-Or%2C+D">Daniel Cohen-Or</a>, <a href="/search/cs?searchtype=author&query=Park%2C+T">Taesung Park</a>, <a href="/search/cs?searchtype=author&query=Gharbi%2C+M">Micha毛l Gharbi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.12382v1-abstract-short" style="display: inline;"> We introduce a novel diffusion transformer, LazyDiffusion, that generates partial image updates efficiently. Our approach targets interactive image editing applications in which, starting from a blank canvas or an image, a user specifies a sequence of localized image modifications using binary masks and text prompts. Our generator operates in two phases. First, a context encoder processes the curr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.12382v1-abstract-full').style.display = 'inline'; document.getElementById('2404.12382v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.12382v1-abstract-full" style="display: none;"> We introduce a novel diffusion transformer, LazyDiffusion, that generates partial image updates efficiently. Our approach targets interactive image editing applications in which, starting from a blank canvas or an image, a user specifies a sequence of localized image modifications using binary masks and text prompts. Our generator operates in two phases. First, a context encoder processes the current canvas and user mask to produce a compact global context tailored to the region to generate. Second, conditioned on this context, a diffusion-based transformer decoder synthesizes the masked pixels in a "lazy" fashion, i.e., it only generates the masked region. This contrasts with previous works that either regenerate the full canvas, wasting time and computation, or confine processing to a tight rectangular crop around the mask, ignoring the global image context altogether. Our decoder's runtime scales with the mask size, which is typically small, while our encoder introduces negligible overhead. We demonstrate that our approach is competitive with state-of-the-art inpainting methods in terms of quality and fidelity while providing a 10x speedup for typical user interactions, where the editing mask represents 10% of the image. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.12382v1-abstract-full').style.display = 'none'; document.getElementById('2404.12382v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.12333">arXiv:2404.12333</a> <span> [<a href="https://arxiv.org/pdf/2404.12333">pdf</a>, <a href="https://arxiv.org/format/2404.12333">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Customizing Text-to-Image Diffusion with Object Viewpoint Control </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kumari%2C+N">Nupur Kumari</a>, <a href="/search/cs?searchtype=author&query=Su%2C+G">Grace Su</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Richard Zhang</a>, <a href="/search/cs?searchtype=author&query=Park%2C+T">Taesung Park</a>, <a href="/search/cs?searchtype=author&query=Shechtman%2C+E">Eli Shechtman</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jun-Yan Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.12333v2-abstract-short" style="display: inline;"> Model customization introduces new concepts to existing text-to-image models, enabling the generation of these new concepts/objects in novel contexts. However, such methods lack accurate camera view control with respect to the new object, and users must resort to prompt engineering (e.g., adding ``top-view'') to achieve coarse view control. In this work, we introduce a new task -- enabling explici… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.12333v2-abstract-full').style.display = 'inline'; document.getElementById('2404.12333v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.12333v2-abstract-full" style="display: none;"> Model customization introduces new concepts to existing text-to-image models, enabling the generation of these new concepts/objects in novel contexts. However, such methods lack accurate camera view control with respect to the new object, and users must resort to prompt engineering (e.g., adding ``top-view'') to achieve coarse view control. In this work, we introduce a new task -- enabling explicit control of the object viewpoint in the customization of text-to-image diffusion models. This allows us to modify the custom object's properties and generate it in various background scenes via text prompts, all while incorporating the object viewpoint as an additional control. This new task presents significant challenges, as one must harmoniously merge a 3D representation from the multi-view images with the 2D pre-trained model. To bridge this gap, we propose to condition the diffusion process on the 3D object features rendered from the target viewpoint. During training, we fine-tune the 3D feature prediction modules to reconstruct the object's appearance and geometry, while reducing overfitting to the input multi-view images. Our method outperforms existing image editing and model customization baselines in preserving the custom object's identity while following the target object viewpoint and the text prompt. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.12333v2-abstract-full').style.display = 'none'; document.getElementById('2404.12333v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">project page: https://customdiffusion360.github.io</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.13044">arXiv:2403.13044</a> <span> [<a href="https://arxiv.org/pdf/2403.13044">pdf</a>, <a href="https://arxiv.org/format/2403.13044">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Magic Fixup: Streamlining Photo Editing by Watching Dynamic Videos </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Alzayer%2C+H">Hadi Alzayer</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+Z">Zhihao Xia</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xuaner Zhang</a>, <a href="/search/cs?searchtype=author&query=Shechtman%2C+E">Eli Shechtman</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+J">Jia-Bin Huang</a>, <a href="/search/cs?searchtype=author&query=Gharbi%2C+M">Michael Gharbi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.13044v1-abstract-short" style="display: inline;"> We propose a generative model that, given a coarsely edited image, synthesizes a photorealistic output that follows the prescribed layout. Our method transfers fine details from the original image and preserves the identity of its parts. Yet, it adapts it to the lighting and context defined by the new layout. Our key insight is that videos are a powerful source of supervision for this task: object… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.13044v1-abstract-full').style.display = 'inline'; document.getElementById('2403.13044v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.13044v1-abstract-full" style="display: none;"> We propose a generative model that, given a coarsely edited image, synthesizes a photorealistic output that follows the prescribed layout. Our method transfers fine details from the original image and preserves the identity of its parts. Yet, it adapts it to the lighting and context defined by the new layout. Our key insight is that videos are a powerful source of supervision for this task: objects and camera motions provide many observations of how the world changes with viewpoint, lighting, and physical interactions. We construct an image dataset in which each sample is a pair of source and target frames extracted from the same video at randomly chosen time intervals. We warp the source frame toward the target using two motion models that mimic the expected test-time user edits. We supervise our model to translate the warped image into the ground truth, starting from a pretrained diffusion model. Our model design explicitly enables fine detail transfer from the source frame to the generated image, while closely following the user-specified layout. We show that by using simple segmentations and coarse 2D manipulations, we can synthesize a photorealistic edit faithful to the user's input while addressing second-order effects like harmonizing the lighting and physical interactions between edited objects. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.13044v1-abstract-full').style.display = 'none'; document.getElementById('2403.13044v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page: https://magic-fixup.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.04718">arXiv:2401.04718</a> <span> [<a href="https://arxiv.org/pdf/2401.04718">pdf</a>, <a href="https://arxiv.org/format/2401.04718">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Jump Cut Smoothing for Talking Heads </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiaojuan Wang</a>, <a href="/search/cs?searchtype=author&query=Park%2C+T">Taesung Park</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yang Zhou</a>, <a href="/search/cs?searchtype=author&query=Shechtman%2C+E">Eli Shechtman</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Richard Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.04718v2-abstract-short" style="display: inline;"> A jump cut offers an abrupt, sometimes unwanted change in the viewing experience. We present a novel framework for smoothing these jump cuts, in the context of talking head videos. We leverage the appearance of the subject from the other source frames in the video, fusing it with a mid-level representation driven by DensePose keypoints and face landmarks. To achieve motion, we interpolate the keyp… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.04718v2-abstract-full').style.display = 'inline'; document.getElementById('2401.04718v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.04718v2-abstract-full" style="display: none;"> A jump cut offers an abrupt, sometimes unwanted change in the viewing experience. We present a novel framework for smoothing these jump cuts, in the context of talking head videos. We leverage the appearance of the subject from the other source frames in the video, fusing it with a mid-level representation driven by DensePose keypoints and face landmarks. To achieve motion, we interpolate the keypoints and landmarks between the end frames around the cut. We then use an image translation network from the keypoints and source frames, to synthesize pixels. Because keypoints can contain errors, we propose a cross-modal attention scheme to select and pick the most appropriate source amongst multiple options for each key point. By leveraging this mid-level representation, our method can achieve stronger results than a strong video interpolation baseline. We demonstrate our method on various jump cuts in the talking head videos, such as cutting filler words, pauses, and even random cuts. Our experiments show that we can achieve seamless transitions, even in the challenging cases where the talking head rotates or moves drastically in the jump cut. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.04718v2-abstract-full').style.display = 'none'; document.getElementById('2401.04718v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Correct typos in the caption of Figure 1; Change the project website address. Project page: https://jeanne-wang.github.io/jumpcutsmoothing/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.04966">arXiv:2312.04966</a> <span> [<a href="https://arxiv.org/pdf/2312.04966">pdf</a>, <a href="https://arxiv.org/format/2312.04966">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> NewMove: Customizing text-to-video models with novel motions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Materzynska%2C+J">Joanna Materzynska</a>, <a href="/search/cs?searchtype=author&query=Sivic%2C+J">Josef Sivic</a>, <a href="/search/cs?searchtype=author&query=Shechtman%2C+E">Eli Shechtman</a>, <a href="/search/cs?searchtype=author&query=Torralba%2C+A">Antonio Torralba</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Richard Zhang</a>, <a href="/search/cs?searchtype=author&query=Russell%2C+B">Bryan Russell</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.04966v2-abstract-short" style="display: inline;"> We introduce an approach for augmenting text-to-video generation models with customized motions, extending their capabilities beyond the motions depicted in the original training data. By leveraging a few video samples demonstrating specific movements as input, our method learns and generalizes the input motion patterns for diverse, text-specified scenarios. Our contributions are threefold. First,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.04966v2-abstract-full').style.display = 'inline'; document.getElementById('2312.04966v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.04966v2-abstract-full" style="display: none;"> We introduce an approach for augmenting text-to-video generation models with customized motions, extending their capabilities beyond the motions depicted in the original training data. By leveraging a few video samples demonstrating specific movements as input, our method learns and generalizes the input motion patterns for diverse, text-specified scenarios. Our contributions are threefold. First, to achieve our results, we finetune an existing text-to-video model to learn a novel mapping between the depicted motion in the input examples to a new unique token. To avoid overfitting to the new custom motion, we introduce an approach for regularization over videos. Second, by leveraging the motion priors in a pretrained model, our method can produce novel videos featuring multiple people doing the custom motion, and can invoke the motion in combination with other motions. Furthermore, our approach extends to the multimodal customization of motion and appearance of individualized subjects, enabling the generation of videos featuring unique characters and distinct motions. Third, to validate our method, we introduce an approach for quantitatively evaluating the learned custom motion and perform a systematic ablation study. We show that our method significantly outperforms prior appearance-based customization approaches when extended to the motion customization task. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.04966v2-abstract-full').style.display = 'none'; document.getElementById('2312.04966v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page: this website https://joaanna.github.io/customizing_motion/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.18828">arXiv:2311.18828</a> <span> [<a href="https://arxiv.org/pdf/2311.18828">pdf</a>, <a href="https://arxiv.org/format/2311.18828">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> One-step Diffusion with Distribution Matching Distillation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yin%2C+T">Tianwei Yin</a>, <a href="/search/cs?searchtype=author&query=Gharbi%2C+M">Micha毛l Gharbi</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Richard Zhang</a>, <a href="/search/cs?searchtype=author&query=Shechtman%2C+E">Eli Shechtman</a>, <a href="/search/cs?searchtype=author&query=Durand%2C+F">Fredo Durand</a>, <a href="/search/cs?searchtype=author&query=Freeman%2C+W+T">William T. Freeman</a>, <a href="/search/cs?searchtype=author&query=Park%2C+T">Taesung Park</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.18828v4-abstract-short" style="display: inline;"> Diffusion models generate high-quality images but require dozens of forward passes. We introduce Distribution Matching Distillation (DMD), a procedure to transform a diffusion model into a one-step image generator with minimal impact on image quality. We enforce the one-step image generator match the diffusion model at distribution level, by minimizing an approximate KL divergence whose gradient c… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.18828v4-abstract-full').style.display = 'inline'; document.getElementById('2311.18828v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.18828v4-abstract-full" style="display: none;"> Diffusion models generate high-quality images but require dozens of forward passes. We introduce Distribution Matching Distillation (DMD), a procedure to transform a diffusion model into a one-step image generator with minimal impact on image quality. We enforce the one-step image generator match the diffusion model at distribution level, by minimizing an approximate KL divergence whose gradient can be expressed as the difference between 2 score functions, one of the target distribution and the other of the synthetic distribution being produced by our one-step generator. The score functions are parameterized as two diffusion models trained separately on each distribution. Combined with a simple regression loss matching the large-scale structure of the multi-step diffusion outputs, our method outperforms all published few-step diffusion approaches, reaching 2.62 FID on ImageNet 64x64 and 11.49 FID on zero-shot COCO-30k, comparable to Stable Diffusion but orders of magnitude faster. Utilizing FP16 inference, our model generates images at 20 FPS on modern hardware. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.18828v4-abstract-full').style.display = 'none'; document.getElementById('2311.18828v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 30 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">CVPR 2024, Project page: https://tianweiy.github.io/dmd/</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) 2024 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.05590">arXiv:2310.05590</a> <span> [<a href="https://arxiv.org/pdf/2310.05590">pdf</a>, <a href="https://arxiv.org/format/2310.05590">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Perceptual Artifacts Localization for Image Synthesis Tasks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Lingzhi Zhang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Zhengjie Xu</a>, <a href="/search/cs?searchtype=author&query=Barnes%2C+C">Connelly Barnes</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yuqian Zhou</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Q">Qing Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">He Zhang</a>, <a href="/search/cs?searchtype=author&query=Amirghodsi%2C+S">Sohrab Amirghodsi</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Z">Zhe Lin</a>, <a href="/search/cs?searchtype=author&query=Shechtman%2C+E">Eli Shechtman</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+J">Jianbo Shi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.05590v1-abstract-short" style="display: inline;"> Recent advancements in deep generative models have facilitated the creation of photo-realistic images across various tasks. However, these generated images often exhibit perceptual artifacts in specific regions, necessitating manual correction. In this study, we present a comprehensive empirical examination of Perceptual Artifacts Localization (PAL) spanning diverse image synthesis endeavors. We i… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.05590v1-abstract-full').style.display = 'inline'; document.getElementById('2310.05590v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.05590v1-abstract-full" style="display: none;"> Recent advancements in deep generative models have facilitated the creation of photo-realistic images across various tasks. However, these generated images often exhibit perceptual artifacts in specific regions, necessitating manual correction. In this study, we present a comprehensive empirical examination of Perceptual Artifacts Localization (PAL) spanning diverse image synthesis endeavors. We introduce a novel dataset comprising 10,168 generated images, each annotated with per-pixel perceptual artifact labels across ten synthesis tasks. A segmentation model, trained on our proposed dataset, effectively localizes artifacts across a range of tasks. Additionally, we illustrate its proficiency in adapting to previously unseen models using minimal training samples. We further propose an innovative zoom-in inpainting pipeline that seamlessly rectifies perceptual artifacts in the generated images. Through our experimental analyses, we elucidate several practical downstream applications, such as automated artifact rectification, non-referential image quality evaluation, and abnormal region detection in images. The dataset and code are released. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.05590v1-abstract-full').style.display = 'none'; document.getElementById('2310.05590v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2307.04157">arXiv:2307.04157</a> <span> [<a href="https://arxiv.org/pdf/2307.04157">pdf</a>, <a href="https://arxiv.org/format/2307.04157">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> DIFF-NST: Diffusion Interleaving For deFormable Neural Style Transfer </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ruta%2C+D">Dan Ruta</a>, <a href="/search/cs?searchtype=author&query=Tarr%C3%A9s%2C+G+C">Gemma Canet Tarr茅s</a>, <a href="/search/cs?searchtype=author&query=Gilbert%2C+A">Andrew Gilbert</a>, <a href="/search/cs?searchtype=author&query=Shechtman%2C+E">Eli Shechtman</a>, <a href="/search/cs?searchtype=author&query=Kolkin%2C+N">Nicholas Kolkin</a>, <a href="/search/cs?searchtype=author&query=Collomosse%2C+J">John Collomosse</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2307.04157v2-abstract-short" style="display: inline;"> Neural Style Transfer (NST) is the field of study applying neural techniques to modify the artistic appearance of a content image to match the style of a reference style image. Traditionally, NST methods have focused on texture-based image edits, affecting mostly low level information and keeping most image structures the same. However, style-based deformation of the content is desirable for some… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.04157v2-abstract-full').style.display = 'inline'; document.getElementById('2307.04157v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2307.04157v2-abstract-full" style="display: none;"> Neural Style Transfer (NST) is the field of study applying neural techniques to modify the artistic appearance of a content image to match the style of a reference style image. Traditionally, NST methods have focused on texture-based image edits, affecting mostly low level information and keeping most image structures the same. However, style-based deformation of the content is desirable for some styles, especially in cases where the style is abstract or the primary concept of the style is in its deformed rendition of some content. With the recent introduction of diffusion models, such as Stable Diffusion, we can access far more powerful image generation techniques, enabling new possibilities. In our work, we propose using this new class of models to perform style transfer while enabling deformable style transfer, an elusive capability in previous models. We show how leveraging the priors of these models can expose new artistic controls at inference time, and we document our findings in exploring this new direction for the field of style transfer. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.04157v2-abstract-full').style.display = 'none'; document.getElementById('2307.04157v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 July, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 July, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.06092">arXiv:2306.06092</a> <span> [<a href="https://arxiv.org/pdf/2306.06092">pdf</a>, <a href="https://arxiv.org/format/2306.06092">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Realistic Saliency Guided Image Enhancement </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Miangoleh%2C+S+M+H">S. Mahdi H. Miangoleh</a>, <a href="/search/cs?searchtype=author&query=Bylinskii%2C+Z">Zoya Bylinskii</a>, <a href="/search/cs?searchtype=author&query=Kee%2C+E">Eric Kee</a>, <a href="/search/cs?searchtype=author&query=Shechtman%2C+E">Eli Shechtman</a>, <a href="/search/cs?searchtype=author&query=Aksoy%2C+Y">Ya臒谋z Aksoy</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.06092v1-abstract-short" style="display: inline;"> Common editing operations performed by professional photographers include the cleanup operations: de-emphasizing distracting elements and enhancing subjects. These edits are challenging, requiring a delicate balance between manipulating the viewer's attention while maintaining photo realism. While recent approaches can boast successful examples of attention attenuation or amplification, most of th… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.06092v1-abstract-full').style.display = 'inline'; document.getElementById('2306.06092v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.06092v1-abstract-full" style="display: none;"> Common editing operations performed by professional photographers include the cleanup operations: de-emphasizing distracting elements and enhancing subjects. These edits are challenging, requiring a delicate balance between manipulating the viewer's attention while maintaining photo realism. While recent approaches can boast successful examples of attention attenuation or amplification, most of them also suffer from frequent unrealistic edits. We propose a realism loss for saliency-guided image enhancement to maintain high realism across varying image types, while attenuating distractors and amplifying objects of interest. Evaluations with professional photographers confirm that we achieve the dual objective of realism and effectiveness, and outperform the recent approaches on their own datasets, while requiring a smaller memory footprint and runtime. We thus offer a viable solution for automating image enhancement and photo cleanup operations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.06092v1-abstract-full').style.display = 'none'; document.getElementById('2306.06092v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">For more info visit http://yaksoy.github.io/realisticEditing/</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Proc. CVPR (2023) </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.17624">arXiv:2305.17624</a> <span> [<a href="https://arxiv.org/pdf/2305.17624">pdf</a>, <a href="https://arxiv.org/format/2305.17624">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> SimpSON: Simplifying Photo Cleanup with Single-Click Distracting Object Segmentation Network </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huynh%2C+C">Chuong Huynh</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yuqian Zhou</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Z">Zhe Lin</a>, <a href="/search/cs?searchtype=author&query=Barnes%2C+C">Connelly Barnes</a>, <a href="/search/cs?searchtype=author&query=Shechtman%2C+E">Eli Shechtman</a>, <a href="/search/cs?searchtype=author&query=Amirghodsi%2C+S">Sohrab Amirghodsi</a>, <a href="/search/cs?searchtype=author&query=Shrivastava%2C+A">Abhinav Shrivastava</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.17624v1-abstract-short" style="display: inline;"> In photo editing, it is common practice to remove visual distractions to improve the overall image quality and highlight the primary subject. However, manually selecting and removing these small and dense distracting regions can be a laborious and time-consuming task. In this paper, we propose an interactive distractor selection method that is optimized to achieve the task with just a single click… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.17624v1-abstract-full').style.display = 'inline'; document.getElementById('2305.17624v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.17624v1-abstract-full" style="display: none;"> In photo editing, it is common practice to remove visual distractions to improve the overall image quality and highlight the primary subject. However, manually selecting and removing these small and dense distracting regions can be a laborious and time-consuming task. In this paper, we propose an interactive distractor selection method that is optimized to achieve the task with just a single click. Our method surpasses the precision and recall achieved by the traditional method of running panoptic segmentation and then selecting the segments containing the clicks. We also showcase how a transformer-based module can be used to identify more distracting regions similar to the user's click position. Our experiments demonstrate that the model can effectively and accurately segment unknown distracting objects interactively and in groups. By significantly simplifying the photo cleaning and retouching process, our proposed model provides inspiration for exploring rare object segmentation and group selection with a single click. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.17624v1-abstract-full').style.display = 'none'; document.getElementById('2305.17624v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">CVPR 2023. Project link: https://simpson-cvpr23.github.io</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2304.05139">arXiv:2304.05139</a> <span> [<a href="https://arxiv.org/pdf/2304.05139">pdf</a>, <a href="https://arxiv.org/format/2304.05139">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> NeAT: Neural Artistic Tracing for Beautiful Style Transfer </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ruta%2C+D">Dan Ruta</a>, <a href="/search/cs?searchtype=author&query=Gilbert%2C+A">Andrew Gilbert</a>, <a href="/search/cs?searchtype=author&query=Collomosse%2C+J">John Collomosse</a>, <a href="/search/cs?searchtype=author&query=Shechtman%2C+E">Eli Shechtman</a>, <a href="/search/cs?searchtype=author&query=Kolkin%2C+N">Nicholas Kolkin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2304.05139v1-abstract-short" style="display: inline;"> Style transfer is the task of reproducing the semantic contents of a source image in the artistic style of a second target image. In this paper, we present NeAT, a new state-of-the art feed-forward style transfer method. We re-formulate feed-forward style transfer as image editing, rather than image generation, resulting in a model which improves over the state-of-the-art in both preserving the so… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.05139v1-abstract-full').style.display = 'inline'; document.getElementById('2304.05139v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2304.05139v1-abstract-full" style="display: none;"> Style transfer is the task of reproducing the semantic contents of a source image in the artistic style of a second target image. In this paper, we present NeAT, a new state-of-the art feed-forward style transfer method. We re-formulate feed-forward style transfer as image editing, rather than image generation, resulting in a model which improves over the state-of-the-art in both preserving the source content and matching the target style. An important component of our model's success is identifying and fixing "style halos", a commonly occurring artefact across many style transfer techniques. In addition to training and testing on standard datasets, we introduce the BBST-4M dataset, a new, large scale, high resolution dataset of 4M images. As a component of curating this data, we present a novel model able to classify if an image is stylistic. We use BBST-4M to improve and measure the generalization of NeAT across a huge variety of styles. Not only does NeAT offer state-of-the-art quality and generalization, it is designed and trained for fast inference at high resolution. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.05139v1-abstract-full').style.display = 'none'; document.getElementById('2304.05139v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 April, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2304.00221">arXiv:2304.00221</a> <span> [<a href="https://arxiv.org/pdf/2304.00221">pdf</a>, <a href="https://arxiv.org/format/2304.00221">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Automatic High Resolution Wire Segmentation and Removal </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chiu%2C+M+T">Mang Tik Chiu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xuaner Zhang</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+Z">Zijun Wei</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yuqian Zhou</a>, <a href="/search/cs?searchtype=author&query=Shechtman%2C+E">Eli Shechtman</a>, <a href="/search/cs?searchtype=author&query=Barnes%2C+C">Connelly Barnes</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Z">Zhe Lin</a>, <a href="/search/cs?searchtype=author&query=Kainz%2C+F">Florian Kainz</a>, <a href="/search/cs?searchtype=author&query=Amirghodsi%2C+S">Sohrab Amirghodsi</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+H">Humphrey Shi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2304.00221v1-abstract-short" style="display: inline;"> Wires and powerlines are common visual distractions that often undermine the aesthetics of photographs. The manual process of precisely segmenting and removing them is extremely tedious and may take up hours, especially on high-resolution photos where wires may span the entire space. In this paper, we present an automatic wire clean-up system that eases the process of wire segmentation and removal… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.00221v1-abstract-full').style.display = 'inline'; document.getElementById('2304.00221v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2304.00221v1-abstract-full" style="display: none;"> Wires and powerlines are common visual distractions that often undermine the aesthetics of photographs. The manual process of precisely segmenting and removing them is extremely tedious and may take up hours, especially on high-resolution photos where wires may span the entire space. In this paper, we present an automatic wire clean-up system that eases the process of wire segmentation and removal/inpainting to within a few seconds. We observe several unique challenges: wires are thin, lengthy, and sparse. These are rare properties of subjects that common segmentation tasks cannot handle, especially in high-resolution images. We thus propose a two-stage method that leverages both global and local contexts to accurately segment wires in high-resolution images efficiently, and a tile-based inpainting strategy to remove the wires given our predicted segmentation masks. We also introduce the first wire segmentation benchmark dataset, WireSegHR. Finally, we demonstrate quantitatively and qualitatively that our wire clean-up system enables fully automated wire removal with great generalization to various wire appearances. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.00221v1-abstract-full').style.display = 'none'; document.getElementById('2304.00221v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 April, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">https://github.com/adobe-research/auto-wire-removal</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2303.13516">arXiv:2303.13516</a> <span> [<a href="https://arxiv.org/pdf/2303.13516">pdf</a>, <a href="https://arxiv.org/format/2303.13516">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Ablating Concepts in Text-to-Image Diffusion Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kumari%2C+N">Nupur Kumari</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+B">Bingliang Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Sheng-Yu Wang</a>, <a href="/search/cs?searchtype=author&query=Shechtman%2C+E">Eli Shechtman</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Richard Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jun-Yan Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2303.13516v3-abstract-short" style="display: inline;"> Large-scale text-to-image diffusion models can generate high-fidelity images with powerful compositional ability. However, these models are typically trained on an enormous amount of Internet data, often containing copyrighted material, licensed images, and personal photos. Furthermore, they have been found to replicate the style of various living artists or memorize exact training samples. How ca… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.13516v3-abstract-full').style.display = 'inline'; document.getElementById('2303.13516v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2303.13516v3-abstract-full" style="display: none;"> Large-scale text-to-image diffusion models can generate high-fidelity images with powerful compositional ability. However, these models are typically trained on an enormous amount of Internet data, often containing copyrighted material, licensed images, and personal photos. Furthermore, they have been found to replicate the style of various living artists or memorize exact training samples. How can we remove such copyrighted concepts or images without retraining the model from scratch? To achieve this goal, we propose an efficient method of ablating concepts in the pretrained model, i.e., preventing the generation of a target concept. Our algorithm learns to match the image distribution for a target style, instance, or text prompt we wish to ablate to the distribution corresponding to an anchor concept. This prevents the model from generating target concepts given its text condition. Extensive experiments show that our method can successfully prevent the generation of the ablated concept while preserving closely related concepts in the model. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.13516v3-abstract-full').style.display = 'none'; document.getElementById('2303.13516v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 March, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICCV 2023. Project website: https://www.cs.cmu.edu/~concept-ablation/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2303.05511">arXiv:2303.05511</a> <span> [<a href="https://arxiv.org/pdf/2303.05511">pdf</a>, <a href="https://arxiv.org/format/2303.05511">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Scaling up GANs for Text-to-Image Synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kang%2C+M">Minguk Kang</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jun-Yan Zhu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Richard Zhang</a>, <a href="/search/cs?searchtype=author&query=Park%2C+J">Jaesik Park</a>, <a href="/search/cs?searchtype=author&query=Shechtman%2C+E">Eli Shechtman</a>, <a href="/search/cs?searchtype=author&query=Paris%2C+S">Sylvain Paris</a>, <a href="/search/cs?searchtype=author&query=Park%2C+T">Taesung Park</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2303.05511v2-abstract-short" style="display: inline;"> The recent success of text-to-image synthesis has taken the world by storm and captured the general public's imagination. From a technical standpoint, it also marked a drastic change in the favored architecture to design generative image models. GANs used to be the de facto choice, with techniques like StyleGAN. With DALL-E 2, auto-regressive and diffusion models became the new standard for large-… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.05511v2-abstract-full').style.display = 'inline'; document.getElementById('2303.05511v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2303.05511v2-abstract-full" style="display: none;"> The recent success of text-to-image synthesis has taken the world by storm and captured the general public's imagination. From a technical standpoint, it also marked a drastic change in the favored architecture to design generative image models. GANs used to be the de facto choice, with techniques like StyleGAN. With DALL-E 2, auto-regressive and diffusion models became the new standard for large-scale generative models overnight. This rapid shift raises a fundamental question: can we scale up GANs to benefit from large datasets like LAION? We find that na脧vely increasing the capacity of the StyleGAN architecture quickly becomes unstable. We introduce GigaGAN, a new GAN architecture that far exceeds this limit, demonstrating GANs as a viable option for text-to-image synthesis. GigaGAN offers three major advantages. First, it is orders of magnitude faster at inference time, taking only 0.13 seconds to synthesize a 512px image. Second, it can synthesize high-resolution images, for example, 16-megapixel pixels in 3.66 seconds. Finally, GigaGAN supports various latent space editing applications such as latent interpolation, style mixing, and vector arithmetic operations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.05511v2-abstract-full').style.display = 'none'; document.getElementById('2303.05511v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 March, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">CVPR 2023. Project webpage at https://mingukkang.github.io/GigaGAN/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2303.00157">arXiv:2303.00157</a> <span> [<a href="https://arxiv.org/pdf/2303.00157">pdf</a>, <a href="https://arxiv.org/format/2303.00157">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Semi-supervised Parametric Real-world Image Harmonization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+K">Ke Wang</a>, <a href="/search/cs?searchtype=author&query=Gharbi%2C+M">Micha毛l Gharbi</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">He Zhang</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+Z">Zhihao Xia</a>, <a href="/search/cs?searchtype=author&query=Shechtman%2C+E">Eli Shechtman</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2303.00157v1-abstract-short" style="display: inline;"> Learning-based image harmonization techniques are usually trained to undo synthetic random global transformations applied to a masked foreground in a single ground truth photo. This simulated data does not model many of the important appearance mismatches (illumination, object boundaries, etc.) between foreground and background in real composites, leading to models that do not generalize well and… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.00157v1-abstract-full').style.display = 'inline'; document.getElementById('2303.00157v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2303.00157v1-abstract-full" style="display: none;"> Learning-based image harmonization techniques are usually trained to undo synthetic random global transformations applied to a masked foreground in a single ground truth photo. This simulated data does not model many of the important appearance mismatches (illumination, object boundaries, etc.) between foreground and background in real composites, leading to models that do not generalize well and cannot model complex local changes. We propose a new semi-supervised training strategy that addresses this problem and lets us learn complex local appearance harmonization from unpaired real composites, where foreground and background come from different images. Our model is fully parametric. It uses RGB curves to correct the global colors and tone and a shading map to model local variations. Our method outperforms previous work on established benchmarks and real composites, as shown in a user study, and processes high-resolution images interactively. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.00157v1-abstract-full').style.display = 'none'; document.getElementById('2303.00157v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 February, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">19 pages, 16 figures, 5 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2301.05225">arXiv:2301.05225</a> <span> [<a href="https://arxiv.org/pdf/2301.05225">pdf</a>, <a href="https://arxiv.org/format/2301.05225">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Domain Expansion of Image Generators </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Nitzan%2C+Y">Yotam Nitzan</a>, <a href="/search/cs?searchtype=author&query=Gharbi%2C+M">Micha毛l Gharbi</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Richard Zhang</a>, <a href="/search/cs?searchtype=author&query=Park%2C+T">Taesung Park</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jun-Yan Zhu</a>, <a href="/search/cs?searchtype=author&query=Cohen-Or%2C+D">Daniel Cohen-Or</a>, <a href="/search/cs?searchtype=author&query=Shechtman%2C+E">Eli Shechtman</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2301.05225v2-abstract-short" style="display: inline;"> Can one inject new concepts into an already trained generative model, while respecting its existing structure and knowledge? We propose a new task - domain expansion - to address this. Given a pretrained generator and novel (but related) domains, we expand the generator to jointly model all domains, old and new, harmoniously. First, we note the generator contains a meaningful, pretrained latent sp… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2301.05225v2-abstract-full').style.display = 'inline'; document.getElementById('2301.05225v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2301.05225v2-abstract-full" style="display: none;"> Can one inject new concepts into an already trained generative model, while respecting its existing structure and knowledge? We propose a new task - domain expansion - to address this. Given a pretrained generator and novel (but related) domains, we expand the generator to jointly model all domains, old and new, harmoniously. First, we note the generator contains a meaningful, pretrained latent space. Is it possible to minimally perturb this hard-earned representation, while maximally representing the new domains? Interestingly, we find that the latent space offers unused, "dormant" directions, which do not affect the output. This provides an opportunity: By "repurposing" these directions, we can represent new domains without perturbing the original representation. In fact, we find that pretrained generators have the capacity to add several - even hundreds - of new domains! Using our expansion method, one "expanded" model can supersede numerous domain-specific models, without expanding the model size. Additionally, a single expanded generator natively supports smooth transitions between domains, as well as composition of domains. Code and project page available at https://yotamnitzan.github.io/domain-expansion/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2301.05225v2-abstract-full').style.display = 'none'; document.getElementById('2301.05225v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 April, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 January, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project Page and code are available at https://yotamnitzan.github.io/domain-expansion/. CVPR 2023 Camera-Ready</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2212.06310">arXiv:2212.06310</a> <span> [<a href="https://arxiv.org/pdf/2212.06310">pdf</a>, <a href="https://arxiv.org/format/2212.06310">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> Structure-Guided Image Completion with Image-level and Object-level Semantic Discriminators </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zheng%2C+H">Haitian Zheng</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Z">Zhe Lin</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+J">Jingwan Lu</a>, <a href="/search/cs?searchtype=author&query=Cohen%2C+S">Scott Cohen</a>, <a href="/search/cs?searchtype=author&query=Shechtman%2C+E">Eli Shechtman</a>, <a href="/search/cs?searchtype=author&query=Barnes%2C+C">Connelly Barnes</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jianming Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Q">Qing Liu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yuqian Zhou</a>, <a href="/search/cs?searchtype=author&query=Amirghodsi%2C+S">Sohrab Amirghodsi</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+J">Jiebo Luo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2212.06310v2-abstract-short" style="display: inline;"> Structure-guided image completion aims to inpaint a local region of an image according to an input guidance map from users. While such a task enables many practical applications for interactive editing, existing methods often struggle to hallucinate realistic object instances in complex natural scenes. Such a limitation is partially due to the lack of semantic-level constraints inside the hole reg… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.06310v2-abstract-full').style.display = 'inline'; document.getElementById('2212.06310v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2212.06310v2-abstract-full" style="display: none;"> Structure-guided image completion aims to inpaint a local region of an image according to an input guidance map from users. While such a task enables many practical applications for interactive editing, existing methods often struggle to hallucinate realistic object instances in complex natural scenes. Such a limitation is partially due to the lack of semantic-level constraints inside the hole region as well as the lack of a mechanism to enforce realistic object generation. In this work, we propose a learning paradigm that consists of semantic discriminators and object-level discriminators for improving the generation of complex semantics and objects. Specifically, the semantic discriminators leverage pretrained visual features to improve the realism of the generated visual concepts. Moreover, the object-level discriminators take aligned instances as inputs to enforce the realism of individual objects. Our proposed scheme significantly improves the generation quality and achieves state-of-the-art results on various tasks, including segmentation-guided completion, edge-guided manipulation and panoptically-guided manipulation on Places2 datasets. Furthermore, our trained model is flexible and can support multiple editing use cases, such as object insertion, replacement, removal and standard inpainting. In particular, our trained model combined with a novel automatic image completion pipeline achieves state-of-the-art results on the standard inpainting task. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.06310v2-abstract-full').style.display = 'none'; document.getElementById('2212.06310v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 December, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">18 pages, 16 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2212.04488">arXiv:2212.04488</a> <span> [<a href="https://arxiv.org/pdf/2212.04488">pdf</a>, <a href="https://arxiv.org/format/2212.04488">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Multi-Concept Customization of Text-to-Image Diffusion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kumari%2C+N">Nupur Kumari</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+B">Bingliang Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Richard Zhang</a>, <a href="/search/cs?searchtype=author&query=Shechtman%2C+E">Eli Shechtman</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jun-Yan Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2212.04488v2-abstract-short" style="display: inline;"> While generative models produce high-quality images of concepts learned from a large-scale database, a user often wishes to synthesize instantiations of their own concepts (for example, their family, pets, or items). Can we teach a model to quickly acquire a new concept, given a few examples? Furthermore, can we compose multiple new concepts together? We propose Custom Diffusion, an efficient meth… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.04488v2-abstract-full').style.display = 'inline'; document.getElementById('2212.04488v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2212.04488v2-abstract-full" style="display: none;"> While generative models produce high-quality images of concepts learned from a large-scale database, a user often wishes to synthesize instantiations of their own concepts (for example, their family, pets, or items). Can we teach a model to quickly acquire a new concept, given a few examples? Furthermore, can we compose multiple new concepts together? We propose Custom Diffusion, an efficient method for augmenting existing text-to-image models. We find that only optimizing a few parameters in the text-to-image conditioning mechanism is sufficiently powerful to represent new concepts while enabling fast tuning (~6 minutes). Additionally, we can jointly train for multiple concepts or combine multiple fine-tuned models into one via closed-form constrained optimization. Our fine-tuned model generates variations of multiple new concepts and seamlessly composes them with existing concepts in novel settings. Our method outperforms or performs on par with several baselines and concurrent works in both qualitative and quantitative evaluations while being memory and computationally efficient. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.04488v2-abstract-full').style.display = 'none'; document.getElementById('2212.04488v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 December, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Updated v2 with results on the new CustomConcept101 dataset https://www.cs.cmu.edu/~custom-diffusion/dataset.html Project webpage: https://www.cs.cmu.edu/~custom-diffusion</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2211.02707">arXiv:2211.02707</a> <span> [<a href="https://arxiv.org/pdf/2211.02707">pdf</a>, <a href="https://arxiv.org/format/2211.02707">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Contrastive Learning for Diverse Disentangled Foreground Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yuheng Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yijun Li</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+J">Jingwan Lu</a>, <a href="/search/cs?searchtype=author&query=Shechtman%2C+E">Eli Shechtman</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+Y+J">Yong Jae Lee</a>, <a href="/search/cs?searchtype=author&query=Singh%2C+K+K">Krishna Kumar Singh</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2211.02707v1-abstract-short" style="display: inline;"> We introduce a new method for diverse foreground generation with explicit control over various factors. Existing image inpainting based foreground generation methods often struggle to generate diverse results and rarely allow users to explicitly control specific factors of variation (e.g., varying the facial identity or expression for face inpainting results). We leverage contrastive learning with… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.02707v1-abstract-full').style.display = 'inline'; document.getElementById('2211.02707v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2211.02707v1-abstract-full" style="display: none;"> We introduce a new method for diverse foreground generation with explicit control over various factors. Existing image inpainting based foreground generation methods often struggle to generate diverse results and rarely allow users to explicitly control specific factors of variation (e.g., varying the facial identity or expression for face inpainting results). We leverage contrastive learning with latent codes to generate diverse foreground results for the same masked input. Specifically, we define two sets of latent codes, where one controls a pre-defined factor (``known''), and the other controls the remaining factors (``unknown''). The sampled latent codes from the two sets jointly bi-modulate the convolution kernels to guide the generator to synthesize diverse results. Experiments demonstrate the superiority of our method over state-of-the-arts in result diversity and generation controllability. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.02707v1-abstract-full').style.display = 'none'; document.getElementById('2211.02707v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ECCV 2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2209.03953">arXiv:2209.03953</a> <span> [<a href="https://arxiv.org/pdf/2209.03953">pdf</a>, <a href="https://arxiv.org/format/2209.03953">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Text-Free Learning of a Natural Language Interface for Pretrained Face Generators </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Du%2C+X">Xiaodan Du</a>, <a href="/search/cs?searchtype=author&query=Yeh%2C+R+A">Raymond A. Yeh</a>, <a href="/search/cs?searchtype=author&query=Kolkin%2C+N">Nicholas Kolkin</a>, <a href="/search/cs?searchtype=author&query=Shechtman%2C+E">Eli Shechtman</a>, <a href="/search/cs?searchtype=author&query=Shakhnarovich%2C+G">Greg Shakhnarovich</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2209.03953v1-abstract-short" style="display: inline;"> We propose Fast text2StyleGAN, a natural language interface that adapts pre-trained GANs for text-guided human face synthesis. Leveraging the recent advances in Contrastive Language-Image Pre-training (CLIP), no text data is required during training. Fast text2StyleGAN is formulated as a conditional variational autoencoder (CVAE) that provides extra control and diversity to the generated images at… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2209.03953v1-abstract-full').style.display = 'inline'; document.getElementById('2209.03953v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2209.03953v1-abstract-full" style="display: none;"> We propose Fast text2StyleGAN, a natural language interface that adapts pre-trained GANs for text-guided human face synthesis. Leveraging the recent advances in Contrastive Language-Image Pre-training (CLIP), no text data is required during training. Fast text2StyleGAN is formulated as a conditional variational autoencoder (CVAE) that provides extra control and diversity to the generated images at test time. Our model does not require re-training or fine-tuning of the GANs or CLIP when encountering new text prompts. In contrast to prior work, we do not rely on optimization at test time, making our method orders of magnitude faster than prior work. Empirically, on FFHQ dataset, our method offers faster and more accurate generation of images from natural language descriptions with varying levels of detail compared to prior work. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2209.03953v1-abstract-full').style.display = 'none'; document.getElementById('2209.03953v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 September, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2208.03552">arXiv:2208.03552</a> <span> [<a href="https://arxiv.org/pdf/2208.03552">pdf</a>, <a href="https://arxiv.org/format/2208.03552">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Inpainting at Modern Camera Resolution by Guided PatchMatch with Auto-Curation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Lingzhi Zhang</a>, <a href="/search/cs?searchtype=author&query=Barnes%2C+C">Connelly Barnes</a>, <a href="/search/cs?searchtype=author&query=Wampler%2C+K">Kevin Wampler</a>, <a href="/search/cs?searchtype=author&query=Amirghodsi%2C+S">Sohrab Amirghodsi</a>, <a href="/search/cs?searchtype=author&query=Shechtman%2C+E">Eli Shechtman</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Z">Zhe Lin</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+J">Jianbo Shi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2208.03552v1-abstract-short" style="display: inline;"> Recently, deep models have established SOTA performance for low-resolution image inpainting, but they lack fidelity at resolutions associated with modern cameras such as 4K or more, and for large holes. We contribute an inpainting benchmark dataset of photos at 4K and above representative of modern sensors. We demonstrate a novel framework that combines deep learning and traditional methods. We us… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2208.03552v1-abstract-full').style.display = 'inline'; document.getElementById('2208.03552v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2208.03552v1-abstract-full" style="display: none;"> Recently, deep models have established SOTA performance for low-resolution image inpainting, but they lack fidelity at resolutions associated with modern cameras such as 4K or more, and for large holes. We contribute an inpainting benchmark dataset of photos at 4K and above representative of modern sensors. We demonstrate a novel framework that combines deep learning and traditional methods. We use an existing deep inpainting model LaMa to fill the hole plausibly, establish three guide images consisting of structure, segmentation, depth, and apply a multiply-guided PatchMatch to produce eight candidate upsampled inpainted images. Next, we feed all candidate inpaintings through a novel curation module that chooses a good inpainting by column summation on an 8x8 antisymmetric pairwise preference matrix. Our framework's results are overwhelmingly preferred by users over 8 strong baselines, with improvements of quantitative metrics up to 7.4 over the best baseline LaMa, and our technique when paired with 4 different SOTA inpainting backbones improves each such that ours is overwhelmingly preferred by users over a strong super-res baseline. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2208.03552v1-abstract-full').style.display = 'none'; document.getElementById('2208.03552v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 August, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">34 pages, 15 figures, ECCV 2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2208.03357">arXiv:2208.03357</a> <span> [<a href="https://arxiv.org/pdf/2208.03357">pdf</a>, <a href="https://arxiv.org/format/2208.03357">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Perceptual Artifacts Localization for Inpainting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Lingzhi Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yuqian Zhou</a>, <a href="/search/cs?searchtype=author&query=Barnes%2C+C">Connelly Barnes</a>, <a href="/search/cs?searchtype=author&query=Amirghodsi%2C+S">Sohrab Amirghodsi</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Z">Zhe Lin</a>, <a href="/search/cs?searchtype=author&query=Shechtman%2C+E">Eli Shechtman</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+J">Jianbo Shi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2208.03357v1-abstract-short" style="display: inline;"> Image inpainting is an essential task for multiple practical applications like object removal and image editing. Deep GAN-based models greatly improve the inpainting performance in structures and textures within the hole, but might also generate unexpected artifacts like broken structures or color blobs. Users perceive these artifacts to judge the effectiveness of inpainting models, and retouch th… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2208.03357v1-abstract-full').style.display = 'inline'; document.getElementById('2208.03357v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2208.03357v1-abstract-full" style="display: none;"> Image inpainting is an essential task for multiple practical applications like object removal and image editing. Deep GAN-based models greatly improve the inpainting performance in structures and textures within the hole, but might also generate unexpected artifacts like broken structures or color blobs. Users perceive these artifacts to judge the effectiveness of inpainting models, and retouch these imperfect areas to inpaint again in a typical retouching workflow. Inspired by this workflow, we propose a new learning task of automatic segmentation of inpainting perceptual artifacts, and apply the model for inpainting model evaluation and iterative refinement. Specifically, we first construct a new inpainting artifacts dataset by manually annotating perceptual artifacts in the results of state-of-the-art inpainting models. Then we train advanced segmentation networks on this dataset to reliably localize inpainting artifacts within inpainted images. Second, we propose a new interpretable evaluation metric called Perceptual Artifact Ratio (PAR), which is the ratio of objectionable inpainted regions to the entire inpainted area. PAR demonstrates a strong correlation with real user preference. Finally, we further apply the generated masks for iterative image inpainting by combining our approach with multiple recent inpainting methods. Extensive experiments demonstrate the consistent decrease of artifact regions and inpainting quality improvement across the different methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2208.03357v1-abstract-full').style.display = 'none'; document.getElementById('2208.03357v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 August, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2207.05385">arXiv:2207.05385</a> <span> [<a href="https://arxiv.org/pdf/2207.05385">pdf</a>, <a href="https://arxiv.org/format/2207.05385">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> Controllable Shadow Generation Using Pixel Height Maps </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sheng%2C+Y">Yichen Sheng</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yifan Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jianming Zhang</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+W">Wei Yin</a>, <a href="/search/cs?searchtype=author&query=Oztireli%2C+A+C">A. Cengiz Oztireli</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">He Zhang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Z">Zhe Lin</a>, <a href="/search/cs?searchtype=author&query=Shechtman%2C+E">Eli Shechtman</a>, <a href="/search/cs?searchtype=author&query=Benes%2C+B">Bedrich Benes</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2207.05385v2-abstract-short" style="display: inline;"> Shadows are essential for realistic image compositing. Physics-based shadow rendering methods require 3D geometries, which are not always available. Deep learning-based shadow synthesis methods learn a mapping from the light information to an object's shadow without explicitly modeling the shadow geometry. Still, they lack control and are prone to visual artifacts. We introduce pixel heigh, a nove… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2207.05385v2-abstract-full').style.display = 'inline'; document.getElementById('2207.05385v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2207.05385v2-abstract-full" style="display: none;"> Shadows are essential for realistic image compositing. Physics-based shadow rendering methods require 3D geometries, which are not always available. Deep learning-based shadow synthesis methods learn a mapping from the light information to an object's shadow without explicitly modeling the shadow geometry. Still, they lack control and are prone to visual artifacts. We introduce pixel heigh, a novel geometry representation that encodes the correlations between objects, ground, and camera pose. The pixel height can be calculated from 3D geometries, manually annotated on 2D images, and can also be predicted from a single-view RGB image by a supervised approach. It can be used to calculate hard shadows in a 2D image based on the projective geometry, providing precise control of the shadows' direction and shape. Furthermore, we propose a data-driven soft shadow generator to apply softness to a hard shadow based on a softness input parameter. Qualitative and quantitative evaluations demonstrate that the proposed pixel height significantly improves the quality of the shadow generation while allowing for controllability. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2207.05385v2-abstract-full').style.display = 'none'; document.getElementById('2207.05385v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 July, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 July, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">15 pages, 11 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2206.06481">arXiv:2206.06481</a> <span> [<a href="https://arxiv.org/pdf/2206.06481">pdf</a>, <a href="https://arxiv.org/format/2206.06481">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> RigNeRF: Fully Controllable Neural 3D Portraits </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Athar%2C+S">ShahRukh Athar</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Zexiang Xu</a>, <a href="/search/cs?searchtype=author&query=Sunkavalli%2C+K">Kalyan Sunkavalli</a>, <a href="/search/cs?searchtype=author&query=Shechtman%2C+E">Eli Shechtman</a>, <a href="/search/cs?searchtype=author&query=Shu%2C+Z">Zhixin Shu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2206.06481v1-abstract-short" style="display: inline;"> Volumetric neural rendering methods, such as neural radiance fields (NeRFs), have enabled photo-realistic novel view synthesis. However, in their standard form, NeRFs do not support the editing of objects, such as a human head, within a scene. In this work, we propose RigNeRF, a system that goes beyond just novel view synthesis and enables full control of head pose and facial expressions learned f… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2206.06481v1-abstract-full').style.display = 'inline'; document.getElementById('2206.06481v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2206.06481v1-abstract-full" style="display: none;"> Volumetric neural rendering methods, such as neural radiance fields (NeRFs), have enabled photo-realistic novel view synthesis. However, in their standard form, NeRFs do not support the editing of objects, such as a human head, within a scene. In this work, we propose RigNeRF, a system that goes beyond just novel view synthesis and enables full control of head pose and facial expressions learned from a single portrait video. We model changes in head pose and facial expressions using a deformation field that is guided by a 3D morphable face model (3DMM). The 3DMM effectively acts as a prior for RigNeRF that learns to predict only residuals to the 3DMM deformations and allows us to render novel (rigid) poses and (non-rigid) expressions that were not present in the input sequence. Using only a smartphone-captured short video of a subject for training, we demonstrate the effectiveness of our method on free view synthesis of a portrait scene with explicit head pose and expression controls. The project page can be found here: http://shahrukhathar.github.io/2022/06/06/RigNeRF.html <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2206.06481v1-abstract-full').style.display = 'none'; document.getElementById('2206.06481v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 June, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The project page can be found here: http://shahrukhathar.github.io/2022/06/06/RigNeRF.html</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2206.06360">arXiv:2206.06360</a> <span> [<a href="https://arxiv.org/pdf/2206.06360">pdf</a>, <a href="https://arxiv.org/format/2206.06360">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ARF: Artistic Radiance Fields </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kai Zhang</a>, <a href="/search/cs?searchtype=author&query=Kolkin%2C+N">Nick Kolkin</a>, <a href="/search/cs?searchtype=author&query=Bi%2C+S">Sai Bi</a>, <a href="/search/cs?searchtype=author&query=Luan%2C+F">Fujun Luan</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Zexiang Xu</a>, <a href="/search/cs?searchtype=author&query=Shechtman%2C+E">Eli Shechtman</a>, <a href="/search/cs?searchtype=author&query=Snavely%2C+N">Noah Snavely</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2206.06360v1-abstract-short" style="display: inline;"> We present a method for transferring the artistic features of an arbitrary style image to a 3D scene. Previous methods that perform 3D stylization on point clouds or meshes are sensitive to geometric reconstruction errors for complex real-world scenes. Instead, we propose to stylize the more robust radiance field representation. We find that the commonly used Gram matrix-based loss tends to produc… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2206.06360v1-abstract-full').style.display = 'inline'; document.getElementById('2206.06360v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2206.06360v1-abstract-full" style="display: none;"> We present a method for transferring the artistic features of an arbitrary style image to a 3D scene. Previous methods that perform 3D stylization on point clouds or meshes are sensitive to geometric reconstruction errors for complex real-world scenes. Instead, we propose to stylize the more robust radiance field representation. We find that the commonly used Gram matrix-based loss tends to produce blurry results without faithful brushstrokes, and introduce a nearest neighbor-based loss that is highly effective at capturing style details while maintaining multi-view consistency. We also propose a novel deferred back-propagation method to enable optimization of memory-intensive radiance fields using style losses defined on full-resolution rendered images. Our extensive evaluation demonstrates that our method outperforms baselines by generating artistic appearance that more closely resembles the style image. Please check our project page for video results and open-source implementations: https://www.cs.cornell.edu/projects/arf/ . <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2206.06360v1-abstract-full').style.display = 'none'; document.getElementById('2206.06360v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 June, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page: https://www.cs.cornell.edu/projects/arf/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2205.02837">arXiv:2205.02837</a> <span> [<a href="https://arxiv.org/pdf/2205.02837">pdf</a>, <a href="https://arxiv.org/format/2205.02837">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> BlobGAN: Spatially Disentangled Scene Representations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Epstein%2C+D">Dave Epstein</a>, <a href="/search/cs?searchtype=author&query=Park%2C+T">Taesung Park</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Richard Zhang</a>, <a href="/search/cs?searchtype=author&query=Shechtman%2C+E">Eli Shechtman</a>, <a href="/search/cs?searchtype=author&query=Efros%2C+A+A">Alexei A. Efros</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2205.02837v2-abstract-short" style="display: inline;"> We propose an unsupervised, mid-level representation for a generative model of scenes. The representation is mid-level in that it is neither per-pixel nor per-image; rather, scenes are modeled as a collection of spatial, depth-ordered "blobs" of features. Blobs are differentiably placed onto a feature grid that is decoded into an image by a generative adversarial network. Due to the spatial unifor… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2205.02837v2-abstract-full').style.display = 'inline'; document.getElementById('2205.02837v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2205.02837v2-abstract-full" style="display: none;"> We propose an unsupervised, mid-level representation for a generative model of scenes. The representation is mid-level in that it is neither per-pixel nor per-image; rather, scenes are modeled as a collection of spatial, depth-ordered "blobs" of features. Blobs are differentiably placed onto a feature grid that is decoded into an image by a generative adversarial network. Due to the spatial uniformity of blobs and the locality inherent to convolution, our network learns to associate different blobs with different entities in a scene and to arrange these blobs to capture scene layout. We demonstrate this emergent behavior by showing that, despite training without any supervision, our method enables applications such as easy manipulation of objects within a scene (e.g., moving, removing, and restyling furniture), creation of feasible scenes given constraints (e.g., plausible rooms with drawers at a particular location), and parsing of real-world images into constituent parts. On a challenging multi-category dataset of indoor scenes, BlobGAN outperforms StyleGAN2 in image quality as measured by FID. See our project page for video results and interactive demo: https://www.dave.ml/blobgan <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2205.02837v2-abstract-full').style.display = 'none'; document.getElementById('2205.02837v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 July, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 5 May, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ECCV 2022. Project webpage available at https://www.dave.ml/blobgan</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2204.07156">arXiv:2204.07156</a> <span> [<a href="https://arxiv.org/pdf/2204.07156">pdf</a>, <a href="https://arxiv.org/format/2204.07156">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Any-resolution Training for High-resolution Image Synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chai%2C+L">Lucy Chai</a>, <a href="/search/cs?searchtype=author&query=Gharbi%2C+M">Michael Gharbi</a>, <a href="/search/cs?searchtype=author&query=Shechtman%2C+E">Eli Shechtman</a>, <a href="/search/cs?searchtype=author&query=Isola%2C+P">Phillip Isola</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Richard Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2204.07156v2-abstract-short" style="display: inline;"> Generative models operate at fixed resolution, even though natural images come in a variety of sizes. As high-resolution details are downsampled away and low-resolution images are discarded altogether, precious supervision is lost. We argue that every pixel matters and create datasets with variable-size images, collected at their native resolutions. To take advantage of varied-size data, we introd… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2204.07156v2-abstract-full').style.display = 'inline'; document.getElementById('2204.07156v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2204.07156v2-abstract-full" style="display: none;"> Generative models operate at fixed resolution, even though natural images come in a variety of sizes. As high-resolution details are downsampled away and low-resolution images are discarded altogether, precious supervision is lost. We argue that every pixel matters and create datasets with variable-size images, collected at their native resolutions. To take advantage of varied-size data, we introduce continuous-scale training, a process that samples patches at random scales to train a new generator with variable output resolutions. First, conditioning the generator on a target scale allows us to generate higher resolution images than previously possible, without adding layers to the model. Second, by conditioning on continuous coordinates, we can sample patches that still obey a consistent global layout, which also allows for scalable training at higher resolutions. Controlled FFHQ experiments show that our method can take advantage of multi-resolution training data better than discrete multi-scale approaches, achieving better FID scores and cleaner high-frequency details. We also train on other natural image domains including churches, mountains, and birds, and demonstrate arbitrary scale synthesis with both coherent global layouts and realistic local details, going beyond 2K resolution in our experiments. Our project page is available at: https://chail.github.io/anyres-gan/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2204.07156v2-abstract-full').style.display = 'none'; document.getElementById('2204.07156v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 August, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 April, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ECCV 2022 camera ready version; project page https://chail.github.io/anyres-gan/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2203.13215">arXiv:2203.13215</a> <span> [<a href="https://arxiv.org/pdf/2203.13215">pdf</a>, <a href="https://arxiv.org/format/2203.13215">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> Neural Neighbor Style Transfer </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kolkin%2C+N">Nicholas Kolkin</a>, <a href="/search/cs?searchtype=author&query=Kucera%2C+M">Michal Kucera</a>, <a href="/search/cs?searchtype=author&query=Paris%2C+S">Sylvain Paris</a>, <a href="/search/cs?searchtype=author&query=Sykora%2C+D">Daniel Sykora</a>, <a href="/search/cs?searchtype=author&query=Shechtman%2C+E">Eli Shechtman</a>, <a href="/search/cs?searchtype=author&query=Shakhnarovich%2C+G">Greg Shakhnarovich</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2203.13215v1-abstract-short" style="display: inline;"> We propose Neural Neighbor Style Transfer (NNST), a pipeline that offers state-of-the-art quality, generalization, and competitive efficiency for artistic style transfer. Our approach is based on explicitly replacing neural features extracted from the content input (to be stylized) with those from a style exemplar, then synthesizing the final output based on these rearranged features. While the sp… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.13215v1-abstract-full').style.display = 'inline'; document.getElementById('2203.13215v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2203.13215v1-abstract-full" style="display: none;"> We propose Neural Neighbor Style Transfer (NNST), a pipeline that offers state-of-the-art quality, generalization, and competitive efficiency for artistic style transfer. Our approach is based on explicitly replacing neural features extracted from the content input (to be stylized) with those from a style exemplar, then synthesizing the final output based on these rearranged features. While the spirit of our approach is similar to prior work, we show that our design decisions dramatically improve the final visual quality. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.13215v1-abstract-full').style.display = 'none'; document.getElementById('2203.13215v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 March, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Code for NNST-Opt available at https://github.com/nkolkin13/NeuralNeighborStyleTransfer</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2203.11947">arXiv:2203.11947</a> <span> [<a href="https://arxiv.org/pdf/2203.11947">pdf</a>, <a href="https://arxiv.org/format/2203.11947">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> CM-GAN: Image Inpainting with Cascaded Modulation GAN and Object-Aware Training </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zheng%2C+H">Haitian Zheng</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Z">Zhe Lin</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+J">Jingwan Lu</a>, <a href="/search/cs?searchtype=author&query=Cohen%2C+S">Scott Cohen</a>, <a href="/search/cs?searchtype=author&query=Shechtman%2C+E">Eli Shechtman</a>, <a href="/search/cs?searchtype=author&query=Barnes%2C+C">Connelly Barnes</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jianming Zhang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+N">Ning Xu</a>, <a href="/search/cs?searchtype=author&query=Amirghodsi%2C+S">Sohrab Amirghodsi</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+J">Jiebo Luo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2203.11947v3-abstract-short" style="display: inline;"> Recent image inpainting methods have made great progress but often struggle to generate plausible image structures when dealing with large holes in complex images. This is partially due to the lack of effective network structures that can capture both the long-range dependency and high-level semantics of an image. We propose cascaded modulation GAN (CM-GAN), a new network design consisting of an e… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.11947v3-abstract-full').style.display = 'inline'; document.getElementById('2203.11947v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2203.11947v3-abstract-full" style="display: none;"> Recent image inpainting methods have made great progress but often struggle to generate plausible image structures when dealing with large holes in complex images. This is partially due to the lack of effective network structures that can capture both the long-range dependency and high-level semantics of an image. We propose cascaded modulation GAN (CM-GAN), a new network design consisting of an encoder with Fourier convolution blocks that extract multi-scale feature representations from the input image with holes and a dual-stream decoder with a novel cascaded global-spatial modulation block at each scale level. In each decoder block, global modulation is first applied to perform coarse and semantic-aware structure synthesis, followed by spatial modulation to further adjust the feature map in a spatially adaptive fashion. In addition, we design an object-aware training scheme to prevent the network from hallucinating new objects inside holes, fulfilling the needs of object removal tasks in real-world scenarios. Extensive experiments are conducted to show that our method significantly outperforms existing methods in both quantitative and qualitative evaluation. Please refer to the project page: \url{https://github.com/htzheng/CM-GAN-Inpainting}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.11947v3-abstract-full').style.display = 'none'; document.getElementById('2203.11947v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 July, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 22 March, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">32 pages, 19 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2203.07293">arXiv:2203.07293</a> <span> [<a href="https://arxiv.org/pdf/2203.07293">pdf</a>, <a href="https://arxiv.org/format/2203.07293">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> InsetGAN for Full-Body Image Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Fr%C3%BChst%C3%BCck%2C+A">Anna Fr眉hst眉ck</a>, <a href="/search/cs?searchtype=author&query=Singh%2C+K+K">Krishna Kumar Singh</a>, <a href="/search/cs?searchtype=author&query=Shechtman%2C+E">Eli Shechtman</a>, <a href="/search/cs?searchtype=author&query=Mitra%2C+N+J">Niloy J. Mitra</a>, <a href="/search/cs?searchtype=author&query=Wonka%2C+P">Peter Wonka</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+J">Jingwan Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2203.07293v1-abstract-short" style="display: inline;"> While GANs can produce photo-realistic images in ideal conditions for certain domains, the generation of full-body human images remains difficult due to the diversity of identities, hairstyles, clothing, and the variance in pose. Instead of modeling this complex domain with a single GAN, we propose a novel method to combine multiple pretrained GANs, where one GAN generates a global canvas (e.g., h… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.07293v1-abstract-full').style.display = 'inline'; document.getElementById('2203.07293v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2203.07293v1-abstract-full" style="display: none;"> While GANs can produce photo-realistic images in ideal conditions for certain domains, the generation of full-body human images remains difficult due to the diversity of identities, hairstyles, clothing, and the variance in pose. Instead of modeling this complex domain with a single GAN, we propose a novel method to combine multiple pretrained GANs, where one GAN generates a global canvas (e.g., human body) and a set of specialized GANs, or insets, focus on different parts (e.g., faces, shoes) that can be seamlessly inserted onto the global canvas. We model the problem as jointly exploring the respective latent spaces such that the generated images can be combined, by inserting the parts from the specialized generators onto the global canvas, without introducing seams. We demonstrate the setup by combining a full body GAN with a dedicated high-quality face GAN to produce plausible-looking humans. We evaluate our results with quantitative metrics and user studies. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.07293v1-abstract-full').style.display = 'none'; document.getElementById('2203.07293v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 March, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project webpage and video available at http://afruehstueck.github.io/insetgan</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2201.13433">arXiv:2201.13433</a> <span> [<a href="https://arxiv.org/pdf/2201.13433">pdf</a>, <a href="https://arxiv.org/format/2201.13433">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Third Time's the Charm? Image and Video Editing with StyleGAN3 </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Alaluf%2C+Y">Yuval Alaluf</a>, <a href="/search/cs?searchtype=author&query=Patashnik%2C+O">Or Patashnik</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Z">Zongze Wu</a>, <a href="/search/cs?searchtype=author&query=Zamir%2C+A">Asif Zamir</a>, <a href="/search/cs?searchtype=author&query=Shechtman%2C+E">Eli Shechtman</a>, <a href="/search/cs?searchtype=author&query=Lischinski%2C+D">Dani Lischinski</a>, <a href="/search/cs?searchtype=author&query=Cohen-Or%2C+D">Daniel Cohen-Or</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2201.13433v1-abstract-short" style="display: inline;"> StyleGAN is arguably one of the most intriguing and well-studied generative models, demonstrating impressive performance in image generation, inversion, and manipulation. In this work, we explore the recent StyleGAN3 architecture, compare it to its predecessor, and investigate its unique advantages, as well as drawbacks. In particular, we demonstrate that while StyleGAN3 can be trained on unaligne… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2201.13433v1-abstract-full').style.display = 'inline'; document.getElementById('2201.13433v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2201.13433v1-abstract-full" style="display: none;"> StyleGAN is arguably one of the most intriguing and well-studied generative models, demonstrating impressive performance in image generation, inversion, and manipulation. In this work, we explore the recent StyleGAN3 architecture, compare it to its predecessor, and investigate its unique advantages, as well as drawbacks. In particular, we demonstrate that while StyleGAN3 can be trained on unaligned data, one can still use aligned data for training, without hindering the ability to generate unaligned imagery. Next, our analysis of the disentanglement of the different latent spaces of StyleGAN3 indicates that the commonly used W/W+ spaces are more entangled than their StyleGAN2 counterparts, underscoring the benefits of using the StyleSpace for fine-grained editing. Considering image inversion, we observe that existing encoder-based techniques struggle when trained on unaligned data. We therefore propose an encoding scheme trained solely on aligned data, yet can still invert unaligned images. Finally, we introduce a novel video inversion and editing workflow that leverages the capabilities of a fine-tuned StyleGAN3 generator to reduce texture sticking and expand the field of view of the edited video. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2201.13433v1-abstract-full').style.display = 'none'; document.getElementById('2201.13433v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 January, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page available at https://yuval-alaluf.github.io/stylegan3-editing/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2201.08131">arXiv:2201.08131</a> <span> [<a href="https://arxiv.org/pdf/2201.08131">pdf</a>, <a href="https://arxiv.org/format/2201.08131">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> GeoFill: Reference-Based Image Inpainting with Better Geometric Understanding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Yunhan Zhao</a>, <a href="/search/cs?searchtype=author&query=Barnes%2C+C">Connelly Barnes</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yuqian Zhou</a>, <a href="/search/cs?searchtype=author&query=Shechtman%2C+E">Eli Shechtman</a>, <a href="/search/cs?searchtype=author&query=Amirghodsi%2C+S">Sohrab Amirghodsi</a>, <a href="/search/cs?searchtype=author&query=Fowlkes%2C+C">Charless Fowlkes</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2201.08131v2-abstract-short" style="display: inline;"> Reference-guided image inpainting restores image pixels by leveraging the content from another single reference image. The primary challenge is how to precisely place the pixels from the reference image into the hole region. Therefore, understanding the 3D geometry that relates pixels between two views is a crucial step towards building a better model. Given the complexity of handling various type… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2201.08131v2-abstract-full').style.display = 'inline'; document.getElementById('2201.08131v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2201.08131v2-abstract-full" style="display: none;"> Reference-guided image inpainting restores image pixels by leveraging the content from another single reference image. The primary challenge is how to precisely place the pixels from the reference image into the hole region. Therefore, understanding the 3D geometry that relates pixels between two views is a crucial step towards building a better model. Given the complexity of handling various types of reference images, we focus on the scenario where the images are captured by freely moving the same camera around. Compared to the previous work, we propose a principled approach that does not make heuristic assumptions about the planarity of the scene. We leverage a monocular depth estimate and predict relative pose between cameras, then align the reference image to the target by a differentiable 3D reprojection and a joint optimization of relative pose and depth map scale and offset. Our approach achieves state-of-the-art performance on both RealEstate10K and MannequinChallenge dataset with large baselines, complex geometry and extreme camera motions. We experimentally verify our approach is also better at handling large holes. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2201.08131v2-abstract-full').style.display = 'none'; document.getElementById('2201.08131v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 January, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to WACV 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2112.11427">arXiv:2112.11427</a> <span> [<a href="https://arxiv.org/pdf/2112.11427">pdf</a>, <a href="https://arxiv.org/format/2112.11427">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> StyleSDF: High-Resolution 3D-Consistent Image and Geometry Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Or-El%2C+R">Roy Or-El</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+X">Xuan Luo</a>, <a href="/search/cs?searchtype=author&query=Shan%2C+M">Mengyi Shan</a>, <a href="/search/cs?searchtype=author&query=Shechtman%2C+E">Eli Shechtman</a>, <a href="/search/cs?searchtype=author&query=Park%2C+J+J">Jeong Joon Park</a>, <a href="/search/cs?searchtype=author&query=Kemelmacher-Shlizerman%2C+I">Ira Kemelmacher-Shlizerman</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2112.11427v2-abstract-short" style="display: inline;"> We introduce a high resolution, 3D-consistent image and shape generation technique which we call StyleSDF. Our method is trained on single-view RGB data only, and stands on the shoulders of StyleGAN2 for image generation, while solving two main challenges in 3D-aware GANs: 1) high-resolution, view-consistent generation of the RGB images, and 2) detailed 3D shape. We achieve this by merging a SDF-b… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2112.11427v2-abstract-full').style.display = 'inline'; document.getElementById('2112.11427v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2112.11427v2-abstract-full" style="display: none;"> We introduce a high resolution, 3D-consistent image and shape generation technique which we call StyleSDF. Our method is trained on single-view RGB data only, and stands on the shoulders of StyleGAN2 for image generation, while solving two main challenges in 3D-aware GANs: 1) high-resolution, view-consistent generation of the RGB images, and 2) detailed 3D shape. We achieve this by merging a SDF-based 3D representation with a style-based 2D generator. Our 3D implicit network renders low-resolution feature maps, from which the style-based network generates view-consistent, 1024x1024 images. Notably, our SDF-based 3D modeling defines detailed 3D surfaces, leading to consistent volume rendering. Our method shows higher quality results compared to state of the art in terms of visual and geometric quality. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2112.11427v2-abstract-full').style.display = 'none'; document.getElementById('2112.11427v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 March, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 December, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Camera-Ready version. Paper was accepted as oral to CVPR 2022. Added discussions and figures from the rebuttal to the supplementary material (sections C & F). Project Webpage: https://stylesdf.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2112.09130">arXiv:2112.09130</a> <span> [<a href="https://arxiv.org/pdf/2112.09130">pdf</a>, <a href="https://arxiv.org/format/2112.09130">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Ensembling Off-the-shelf Models for GAN Training </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kumari%2C+N">Nupur Kumari</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Richard Zhang</a>, <a href="/search/cs?searchtype=author&query=Shechtman%2C+E">Eli Shechtman</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jun-Yan Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2112.09130v3-abstract-short" style="display: inline;"> The advent of large-scale training has produced a cornucopia of powerful visual recognition models. However, generative models, such as GANs, have traditionally been trained from scratch in an unsupervised manner. Can the collective "knowledge" from a large bank of pretrained vision models be leveraged to improve GAN training? If so, with so many models to choose from, which one(s) should be selec… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2112.09130v3-abstract-full').style.display = 'inline'; document.getElementById('2112.09130v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2112.09130v3-abstract-full" style="display: none;"> The advent of large-scale training has produced a cornucopia of powerful visual recognition models. However, generative models, such as GANs, have traditionally been trained from scratch in an unsupervised manner. Can the collective "knowledge" from a large bank of pretrained vision models be leveraged to improve GAN training? If so, with so many models to choose from, which one(s) should be selected, and in what manner are they most effective? We find that pretrained computer vision models can significantly improve performance when used in an ensemble of discriminators. Notably, the particular subset of selected models greatly affects performance. We propose an effective selection mechanism, by probing the linear separability between real and fake samples in pretrained model embeddings, choosing the most accurate model, and progressively adding it to the discriminator ensemble. Interestingly, our method can improve GAN training in both limited data and large-scale settings. Given only 10k training samples, our FID on LSUN Cat matches the StyleGAN2 trained on 1.6M images. On the full dataset, our method improves FID by 1.5x to 2x on cat, church, and horse categories of LSUN. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2112.09130v3-abstract-full').style.display = 'none'; document.getElementById('2112.09130v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 May, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 December, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">CVPR 2022 (Oral). GitHub: https://github.com/nupurkmr9/vision-aided-gan Project webpage: https://www.cs.cmu.edu/~vision-aided-gan/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2112.05143">arXiv:2112.05143</a> <span> [<a href="https://arxiv.org/pdf/2112.05143">pdf</a>, <a href="https://arxiv.org/format/2112.05143">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> GAN-Supervised Dense Visual Alignment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Peebles%2C+W">William Peebles</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jun-Yan Zhu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Richard Zhang</a>, <a href="/search/cs?searchtype=author&query=Torralba%2C+A">Antonio Torralba</a>, <a href="/search/cs?searchtype=author&query=Efros%2C+A+A">Alexei A. Efros</a>, <a href="/search/cs?searchtype=author&query=Shechtman%2C+E">Eli Shechtman</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2112.05143v2-abstract-short" style="display: inline;"> We propose GAN-Supervised Learning, a framework for learning discriminative models and their GAN-generated training data jointly end-to-end. We apply our framework to the dense visual alignment problem. Inspired by the classic Congealing method, our GANgealing algorithm trains a Spatial Transformer to map random samples from a GAN trained on unaligned data to a common, jointly-learned target mode.… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2112.05143v2-abstract-full').style.display = 'inline'; document.getElementById('2112.05143v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2112.05143v2-abstract-full" style="display: none;"> We propose GAN-Supervised Learning, a framework for learning discriminative models and their GAN-generated training data jointly end-to-end. We apply our framework to the dense visual alignment problem. Inspired by the classic Congealing method, our GANgealing algorithm trains a Spatial Transformer to map random samples from a GAN trained on unaligned data to a common, jointly-learned target mode. We show results on eight datasets, all of which demonstrate our method successfully aligns complex data and discovers dense correspondences. GANgealing significantly outperforms past self-supervised correspondence algorithms and performs on-par with (and sometimes exceeds) state-of-the-art supervised correspondence algorithms on several datasets -- without making use of any correspondence supervision or data augmentation and despite being trained exclusively on GAN-generated data. For precise correspondence, we improve upon state-of-the-art supervised methods by as much as $3\times$. We show applications of our method for augmented reality, image editing and automated pre-processing of image datasets for downstream GAN training. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2112.05143v2-abstract-full').style.display = 'none'; document.getElementById('2112.05143v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 April, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 December, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">An updated version of our CVPR 2022 paper (oral); v2 features additional references and minor text changes. Code available at https://www.github.com/wpeebles/gangealing . Project page and videos available at https://www.wpeebles.com/gangealing</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2110.11323">arXiv:2110.11323</a> <span> [<a href="https://arxiv.org/pdf/2110.11323">pdf</a>, <a href="https://arxiv.org/format/2110.11323">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> StyleAlign: Analysis and Applications of Aligned StyleGAN Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+Z">Zongze Wu</a>, <a href="/search/cs?searchtype=author&query=Nitzan%2C+Y">Yotam Nitzan</a>, <a href="/search/cs?searchtype=author&query=Shechtman%2C+E">Eli Shechtman</a>, <a href="/search/cs?searchtype=author&query=Lischinski%2C+D">Dani Lischinski</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2110.11323v2-abstract-short" style="display: inline;"> In this paper, we perform an in-depth study of the properties and applications of aligned generative models. We refer to two models as aligned if they share the same architecture, and one of them (the child) is obtained from the other (the parent) via fine-tuning to another domain, a common practice in transfer learning. Several works already utilize some basic properties of aligned StyleGAN model… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.11323v2-abstract-full').style.display = 'inline'; document.getElementById('2110.11323v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2110.11323v2-abstract-full" style="display: none;"> In this paper, we perform an in-depth study of the properties and applications of aligned generative models. We refer to two models as aligned if they share the same architecture, and one of them (the child) is obtained from the other (the parent) via fine-tuning to another domain, a common practice in transfer learning. Several works already utilize some basic properties of aligned StyleGAN models to perform image-to-image translation. Here, we perform the first detailed exploration of model alignment, also focusing on StyleGAN. First, we empirically analyze aligned models and provide answers to important questions regarding their nature. In particular, we find that the child model's latent spaces are semantically aligned with those of the parent, inheriting incredibly rich semantics, even for distant data domains such as human faces and churches. Second, equipped with this better understanding, we leverage aligned models to solve a diverse set of tasks. In addition to image translation, we demonstrate fully automatic cross-domain image morphing. We further show that zero-shot vision tasks may be performed in the child domain, while relying exclusively on supervision in the parent domain. We demonstrate qualitatively and quantitatively that our approach yields state-of-the-art results, while requiring only simple fine-tuning and inversion. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.11323v2-abstract-full').style.display = 'none'; document.getElementById('2110.11323v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 May, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 October, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">44 pages, 37 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Proc. 10th International Conference on Learning Representations, ICLR 2022 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2110.10501">arXiv:2110.10501</a> <span> [<a href="https://arxiv.org/pdf/2110.10501">pdf</a>, <a href="https://arxiv.org/format/2110.10501">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> STALP: Style Transfer with Auxiliary Limited Pairing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Futschik%2C+D">David Futschik</a>, <a href="/search/cs?searchtype=author&query=Ku%C4%8Dera%2C+M">Michal Ku膷era</a>, <a href="/search/cs?searchtype=author&query=Luk%C3%A1%C4%8D%2C+M">Michal Luk谩膷</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhaowen Wang</a>, <a href="/search/cs?searchtype=author&query=Shechtman%2C+E">Eli Shechtman</a>, <a href="/search/cs?searchtype=author&query=S%C3%BDkora%2C+D">Daniel S媒kora</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2110.10501v1-abstract-short" style="display: inline;"> We present an approach to example-based stylization of images that uses a single pair of a source image and its stylized counterpart. We demonstrate how to train an image translation network that can perform real-time semantically meaningful style transfer to a set of target images with similar content as the source image. A key added value of our approach is that it considers also consistency of… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.10501v1-abstract-full').style.display = 'inline'; document.getElementById('2110.10501v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2110.10501v1-abstract-full" style="display: none;"> We present an approach to example-based stylization of images that uses a single pair of a source image and its stylized counterpart. We demonstrate how to train an image translation network that can perform real-time semantically meaningful style transfer to a set of target images with similar content as the source image. A key added value of our approach is that it considers also consistency of target images during training. Although those have no stylized counterparts, we constrain the translation to keep the statistics of neural responses compatible with those extracted from the stylized source. In contrast to concurrent techniques that use a similar input, our approach better preserves important visual characteristics of the source style and can deliver temporally stable results without the need to explicitly handle temporal consistency. We demonstrate its practical utility on various applications including video stylization, style transfer to panoramas, faces, and 3D models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.10501v1-abstract-full').style.display = 'none'; document.getElementById('2110.10501v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 October, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Eurographics 2021</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2110.06269">arXiv:2110.06269</a> <span> [<a href="https://arxiv.org/pdf/2110.06269">pdf</a>, <a href="https://arxiv.org/format/2110.06269">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> Real Image Inversion via Segments </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Futschik%2C+D">David Futschik</a>, <a href="/search/cs?searchtype=author&query=Luk%C3%A1%C4%8D%2C+M">Michal Luk谩膷</a>, <a href="/search/cs?searchtype=author&query=Shechtman%2C+E">Eli Shechtman</a>, <a href="/search/cs?searchtype=author&query=S%C3%BDkora%2C+D">Daniel S媒kora</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2110.06269v1-abstract-short" style="display: inline;"> In this short report, we present a simple, yet effective approach to editing real images via generative adversarial networks (GAN). Unlike previous techniques, that treat all editing tasks as an operation that affects pixel values in the entire image in our approach we cut up the image into a set of smaller segments. For those segments corresponding latent codes of a generative network can be esti… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.06269v1-abstract-full').style.display = 'inline'; document.getElementById('2110.06269v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2110.06269v1-abstract-full" style="display: none;"> In this short report, we present a simple, yet effective approach to editing real images via generative adversarial networks (GAN). Unlike previous techniques, that treat all editing tasks as an operation that affects pixel values in the entire image in our approach we cut up the image into a set of smaller segments. For those segments corresponding latent codes of a generative network can be estimated with greater accuracy due to the lower number of constraints. When codes are altered by the user the content in the image is manipulated locally while the rest of it remains unaffected. Thanks to this property the final edited image better retains the original structures and thus helps to preserve natural look. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.06269v1-abstract-full').style.display = 'none'; document.getElementById('2110.06269v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 October, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7 pages, 10 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2110.04281">arXiv:2110.04281</a> <span> [<a href="https://arxiv.org/pdf/2110.04281">pdf</a>, <a href="https://arxiv.org/format/2110.04281">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Collaging Class-specific GANs for Semantic Image Synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yuheng Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yijun Li</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+J">Jingwan Lu</a>, <a href="/search/cs?searchtype=author&query=Shechtman%2C+E">Eli Shechtman</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+Y+J">Yong Jae Lee</a>, <a href="/search/cs?searchtype=author&query=Singh%2C+K+K">Krishna Kumar Singh</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2110.04281v1-abstract-short" style="display: inline;"> We propose a new approach for high resolution semantic image synthesis. It consists of one base image generator and multiple class-specific generators. The base generator generates high quality images based on a segmentation map. To further improve the quality of different objects, we create a bank of Generative Adversarial Networks (GANs) by separately training class-specific models. This has sev… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.04281v1-abstract-full').style.display = 'inline'; document.getElementById('2110.04281v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2110.04281v1-abstract-full" style="display: none;"> We propose a new approach for high resolution semantic image synthesis. It consists of one base image generator and multiple class-specific generators. The base generator generates high quality images based on a segmentation map. To further improve the quality of different objects, we create a bank of Generative Adversarial Networks (GANs) by separately training class-specific models. This has several benefits including -- dedicated weights for each class; centrally aligned data for each model; additional training data from other sources, potential of higher resolution and quality; and easy manipulation of a specific object in the scene. Experiments show that our approach can generate high quality images in high resolution while having flexibility of object-level control by using class-specific generators. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.04281v1-abstract-full').style.display = 'none'; document.getElementById('2110.04281v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 October, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICCV 2021</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2109.06166">arXiv:2109.06166</a> <span> [<a href="https://arxiv.org/pdf/2109.06166">pdf</a>, <a href="https://arxiv.org/format/2109.06166">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Pose with Style: Detail-Preserving Pose-Guided Image Synthesis with Conditional StyleGAN </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=AlBahar%2C+B">Badour AlBahar</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+J">Jingwan Lu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jimei Yang</a>, <a href="/search/cs?searchtype=author&query=Shu%2C+Z">Zhixin Shu</a>, <a href="/search/cs?searchtype=author&query=Shechtman%2C+E">Eli Shechtman</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+J">Jia-Bin Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2109.06166v1-abstract-short" style="display: inline;"> We present an algorithm for re-rendering a person from a single image under arbitrary poses. Existing methods often have difficulties in hallucinating occluded contents photo-realistically while preserving the identity and fine details in the source image. We first learn to inpaint the correspondence field between the body surface texture and the source image with a human body symmetry prior. The… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2109.06166v1-abstract-full').style.display = 'inline'; document.getElementById('2109.06166v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2109.06166v1-abstract-full" style="display: none;"> We present an algorithm for re-rendering a person from a single image under arbitrary poses. Existing methods often have difficulties in hallucinating occluded contents photo-realistically while preserving the identity and fine details in the source image. We first learn to inpaint the correspondence field between the body surface texture and the source image with a human body symmetry prior. The inpainted correspondence field allows us to transfer/warp local features extracted from the source to the target view even under large pose changes. Directly mapping the warped local features to an RGB image using a simple CNN decoder often leads to visible artifacts. Thus, we extend the StyleGAN generator so that it takes pose as input (for controlling poses) and introduces a spatially varying modulation for the latent space using the warped local features (for controlling appearances). We show that our method compares favorably against the state-of-the-art algorithms in both quantitative evaluation and visual comparison. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2109.06166v1-abstract-full').style.display = 'none'; document.getElementById('2109.06166v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 September, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">SIGGRAPH Asia 2021. Project page: https://pose-with-style.github.io/</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Shechtman%2C+E&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Shechtman%2C+E&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Shechtman%2C+E&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> </ul> </nav> <div class="is-hidden-tablet">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary">  <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div>   <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div>  </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

CINXE.COM

Search | arXiv e-print repository