CINXE.COM

Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/> <!-- new favicon config and versions by realfavicongenerator.net --> <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b"> <!-- end favicon config --> <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\\(","\\)"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a> <!-- contains Cornell logo and sponsor statement --> <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 56 results for author: <span class="mathjax">Belilovsky, E</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Belilovsky%2C+E">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Belilovsky, E"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Belilovsky%2C+E&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Belilovsky, E"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Belilovsky%2C+E&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Belilovsky%2C+E&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Belilovsky%2C+E&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12831">arXiv:2411.12831</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.12831">pdf</a>, <a href="https://arxiv.org/format/2411.12831">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Towards motion from video diffusion models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Janson%2C+P">Paul Janson</a>, <a href="/search/cs?searchtype=author&amp;query=Popa%2C+T">Tiberiu Popa</a>, <a href="/search/cs?searchtype=author&amp;query=Belilovsky%2C+E">Eugene Belilovsky</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12831v1-abstract-short" style="display: inline;"> Text-conditioned video diffusion models have emerged as a powerful tool in the realm of video generation and editing. But their ability to capture the nuances of human movement remains under-explored. Indeed the ability of these models to faithfully model an array of text prompts can lead to a wide host of applications in human and character animation. In this work, we take initial steps to invest&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12831v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12831v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12831v1-abstract-full" style="display: none;"> Text-conditioned video diffusion models have emerged as a powerful tool in the realm of video generation and editing. But their ability to capture the nuances of human movement remains under-explored. Indeed the ability of these models to faithfully model an array of text prompts can lead to a wide host of applications in human and character animation. In this work, we take initial steps to investigate whether these models can effectively guide the synthesis of realistic human body animations. Specifically we propose to synthesize human motion by deforming an SMPL-X body representation guided by Score distillation sampling (SDS) calculated using a video diffusion model. By analyzing the fidelity of the resulting animations, we gain insights into the extent to which we can obtain motion using publicly available text-to-video diffusion models using SDS. Our findings shed light on the potential and limitations of these models for generating diverse and plausible human motions, paving the way for further research in this exciting area. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12831v1-abstract-full').style.display = 'none'; document.getElementById('2411.12831v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at ECCV 2024 Workshop :Foundation Models for 3D Humans</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12168">arXiv:2411.12168</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.12168">pdf</a>, <a href="https://arxiv.org/format/2411.12168">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> Sketch-guided Cage-based 3D Gaussian Splatting Deformation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xie%2C+T">Tianhao Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Aigerman%2C+N">Noam Aigerman</a>, <a href="/search/cs?searchtype=author&amp;query=Belilovsky%2C+E">Eugene Belilovsky</a>, <a href="/search/cs?searchtype=author&amp;query=Popa%2C+T">Tiberiu Popa</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12168v1-abstract-short" style="display: inline;"> 3D Gaussian Splatting (GS) is one of the most promising novel 3D representations that has received great interest in computer graphics and computer vision. While various systems have introduced editing capabilities for 3D GS, such as those guided by text prompts, fine-grained control over deformation remains an open challenge. In this work, we present a novel sketch-guided 3D GS deformation system&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12168v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12168v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12168v1-abstract-full" style="display: none;"> 3D Gaussian Splatting (GS) is one of the most promising novel 3D representations that has received great interest in computer graphics and computer vision. While various systems have introduced editing capabilities for 3D GS, such as those guided by text prompts, fine-grained control over deformation remains an open challenge. In this work, we present a novel sketch-guided 3D GS deformation system that allows users to intuitively modify the geometry of a 3D GS model by drawing a silhouette sketch from a single viewpoint. Our approach introduces a new deformation method that combines cage-based deformations with a variant of Neural Jacobian Fields, enabling precise, fine-grained control. Additionally, it leverages large-scale 2D diffusion priors and ControlNet to ensure the generated deformations are semantically plausible. Through a series of experiments, we demonstrate the effectiveness of our method and showcase its ability to animate static 3D GS models as one of its key applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12168v1-abstract-full').style.display = 'none'; document.getElementById('2411.12168v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 9 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.14637">arXiv:2409.14637</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.14637">pdf</a>, <a href="https://arxiv.org/format/2409.14637">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Not Only the Last-Layer Features for Spurious Correlations: All Layer Deep Feature Reweighting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hameed%2C+H+W">Humza Wajid Hameed</a>, <a href="/search/cs?searchtype=author&amp;query=Nanfack%2C+G">Geraldin Nanfack</a>, <a href="/search/cs?searchtype=author&amp;query=Belilovsky%2C+E">Eugene Belilovsky</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.14637v1-abstract-short" style="display: inline;"> Spurious correlations are a major source of errors for machine learning models, in particular when aiming for group-level fairness. It has been recently shown that a powerful approach to combat spurious correlations is to re-train the last layer on a balanced validation dataset, isolating robust features for the predictor. However, key attributes can sometimes be discarded by neural networks towar&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.14637v1-abstract-full').style.display = 'inline'; document.getElementById('2409.14637v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.14637v1-abstract-full" style="display: none;"> Spurious correlations are a major source of errors for machine learning models, in particular when aiming for group-level fairness. It has been recently shown that a powerful approach to combat spurious correlations is to re-train the last layer on a balanced validation dataset, isolating robust features for the predictor. However, key attributes can sometimes be discarded by neural networks towards the last layer. In this work, we thus consider retraining a classifier on a set of features derived from all layers. We utilize a recently proposed feature selection strategy to select unbiased features from all the layers. We observe this approach gives significant improvements in worst-group accuracy on several standard benchmarks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.14637v1-abstract-full').style.display = 'none'; document.getElementById('2409.14637v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.04434">arXiv:2409.04434</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.04434">pdf</a>, <a href="https://arxiv.org/format/2409.04434">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Accelerating Training with Neuron Interaction and Nowcasting Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Knyazev%2C+B">Boris Knyazev</a>, <a href="/search/cs?searchtype=author&amp;query=Moudgil%2C+A">Abhinav Moudgil</a>, <a href="/search/cs?searchtype=author&amp;query=Lajoie%2C+G">Guillaume Lajoie</a>, <a href="/search/cs?searchtype=author&amp;query=Belilovsky%2C+E">Eugene Belilovsky</a>, <a href="/search/cs?searchtype=author&amp;query=Lacoste-Julien%2C+S">Simon Lacoste-Julien</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.04434v2-abstract-short" style="display: inline;"> Neural network training can be accelerated when a learnable update rule is used in lieu of classic adaptive optimizers (e.g. Adam). However, learnable update rules can be costly and unstable to train and use. Recently, Jang et al. (2023) proposed a simpler approach to accelerate training based on weight nowcaster networks (WNNs). In their approach, Adam is used for most of the optimization steps a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.04434v2-abstract-full').style.display = 'inline'; document.getElementById('2409.04434v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.04434v2-abstract-full" style="display: none;"> Neural network training can be accelerated when a learnable update rule is used in lieu of classic adaptive optimizers (e.g. Adam). However, learnable update rules can be costly and unstable to train and use. Recently, Jang et al. (2023) proposed a simpler approach to accelerate training based on weight nowcaster networks (WNNs). In their approach, Adam is used for most of the optimization steps and periodically, only every few steps, a WNN nowcasts (predicts near future) parameters. We improve WNNs by proposing neuron interaction and nowcasting (NiNo) networks. In contrast to WNNs, NiNo leverages neuron connectivity and graph neural networks to more accurately nowcast parameters. We further show that in some networks, such as Transformers, modeling neuron connectivity accurately is challenging. We address this and other limitations, which allows NiNo to accelerate Adam training by up to 50% in vision and language tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.04434v2-abstract-full').style.display = 'none'; document.getElementById('2409.04434v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">added Llama3-based results and other updates, code is https://github.com/SamsungSAILMontreal/nino</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.05385">arXiv:2407.05385</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.05385">pdf</a>, <a href="https://arxiv.org/format/2407.05385">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Harmony in Diversity: Merging Neural Networks with Canonical Correlation Analysis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Horoi%2C+S">Stefan Horoi</a>, <a href="/search/cs?searchtype=author&amp;query=Camacho%2C+A+M+O">Albert Manuel Orozco Camacho</a>, <a href="/search/cs?searchtype=author&amp;query=Belilovsky%2C+E">Eugene Belilovsky</a>, <a href="/search/cs?searchtype=author&amp;query=Wolf%2C+G">Guy Wolf</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.05385v1-abstract-short" style="display: inline;"> Combining the predictions of multiple trained models through ensembling is generally a good way to improve accuracy by leveraging the different learned features of the models, however it comes with high computational and storage costs. Model fusion, the act of merging multiple models into one by combining their parameters reduces these costs but doesn&#39;t work as well in practice. Indeed, neural net&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.05385v1-abstract-full').style.display = 'inline'; document.getElementById('2407.05385v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.05385v1-abstract-full" style="display: none;"> Combining the predictions of multiple trained models through ensembling is generally a good way to improve accuracy by leveraging the different learned features of the models, however it comes with high computational and storage costs. Model fusion, the act of merging multiple models into one by combining their parameters reduces these costs but doesn&#39;t work as well in practice. Indeed, neural network loss landscapes are high-dimensional and non-convex and the minima found through learning are typically separated by high loss barriers. Numerous recent works have been focused on finding permutations matching one network features to the features of a second one, lowering the loss barrier on the linear path between them in parameter space. However, permutations are restrictive since they assume a one-to-one mapping between the different models&#39; neurons exists. We propose a new model merging algorithm, CCA Merge, which is based on Canonical Correlation Analysis and aims to maximize the correlations between linear combinations of the model features. We show that our alignment method leads to better performances than past methods when averaging models trained on the same, or differing data splits. We also extend this analysis into the harder setting where more than 2 models are merged, and we find that CCA Merge works significantly better than past methods. Our code is publicly available at https://github.com/shoroi/align-n-merge <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.05385v1-abstract-full').style.display = 'none'; document.getElementById('2407.05385v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Proceedings of the Forty-first International Conference on Machine Learning (ICML 2024)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.13653">arXiv:2406.13653</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.13653">pdf</a>, <a href="https://arxiv.org/format/2406.13653">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Controlling Forgetting with Test-Time Data in Continual Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Singh%2C+V">Vaibhav Singh</a>, <a href="/search/cs?searchtype=author&amp;query=Aljundi%2C+R">Rahaf Aljundi</a>, <a href="/search/cs?searchtype=author&amp;query=Belilovsky%2C+E">Eugene Belilovsky</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.13653v1-abstract-short" style="display: inline;"> Foundational vision-language models have shown impressive performance on various downstream tasks. Yet, there is still a pressing need to update these models later as new tasks or domains become available. Ongoing Continual Learning (CL) research provides techniques to overcome catastrophic forgetting of previous information when new knowledge is acquired. To date, CL techniques focus only on the&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.13653v1-abstract-full').style.display = 'inline'; document.getElementById('2406.13653v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.13653v1-abstract-full" style="display: none;"> Foundational vision-language models have shown impressive performance on various downstream tasks. Yet, there is still a pressing need to update these models later as new tasks or domains become available. Ongoing Continual Learning (CL) research provides techniques to overcome catastrophic forgetting of previous information when new knowledge is acquired. To date, CL techniques focus only on the supervised training sessions. This results in significant forgetting yielding inferior performance to even the prior model zero shot performance. In this work, we argue that test-time data hold great information that can be leveraged in a self supervised manner to refresh the model&#39;s memory of previous learned tasks and hence greatly reduce forgetting at no extra labelling cost. We study how unsupervised data can be employed online to improve models&#39; performance on prior tasks upon encountering representative samples. We propose a simple yet effective student-teacher model with gradient based sparse parameters updates and show significant performance improvements and reduction in forgetting, which could alleviate the role of an offline episodic memory/experience replay buffer. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.13653v1-abstract-full').style.display = 'none'; document.getElementById('2406.13653v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages, 2 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.02613">arXiv:2406.02613</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.02613">pdf</a>, <a href="https://arxiv.org/format/2406.02613">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> ACCO: Accumulate while you Communicate, Hiding Communications in Distributed LLM Training </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Nabli%2C+A">Adel Nabli</a>, <a href="/search/cs?searchtype=author&amp;query=Fournier%2C+L">Louis Fournier</a>, <a href="/search/cs?searchtype=author&amp;query=Erbacher%2C+P">Pierre Erbacher</a>, <a href="/search/cs?searchtype=author&amp;query=Serrano%2C+L">Louis Serrano</a>, <a href="/search/cs?searchtype=author&amp;query=Belilovsky%2C+E">Eugene Belilovsky</a>, <a href="/search/cs?searchtype=author&amp;query=Oyallon%2C+E">Edouard Oyallon</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.02613v1-abstract-short" style="display: inline;"> Training Large Language Models (LLMs) relies heavily on distributed implementations, employing multiple GPUs to compute stochastic gradients on model replicas in parallel. However, synchronizing gradients in data parallel settings induces a communication overhead increasing with the number of distributed workers, which can impede the efficiency gains of parallelization. To address this challenge,&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.02613v1-abstract-full').style.display = 'inline'; document.getElementById('2406.02613v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.02613v1-abstract-full" style="display: none;"> Training Large Language Models (LLMs) relies heavily on distributed implementations, employing multiple GPUs to compute stochastic gradients on model replicas in parallel. However, synchronizing gradients in data parallel settings induces a communication overhead increasing with the number of distributed workers, which can impede the efficiency gains of parallelization. To address this challenge, optimization algorithms reducing inter-worker communication have emerged, such as local optimization methods used in Federated Learning. While effective in minimizing communication overhead, these methods incur significant memory costs, hindering scalability: in addition to extra momentum variables, if communications are only allowed between multiple local optimization steps, then the optimizer&#39;s states cannot be sharded among workers. In response, we propose $\textbf{AC}$cumulate while $\textbf{CO}$mmunicate ($\texttt{ACCO}$), a memory-efficient optimization algorithm tailored for distributed training of LLMs. $\texttt{ACCO}$ allows to shard optimizer states across workers, overlaps gradient computations and communications to conceal communication costs, and accommodates heterogeneous hardware. Our method relies on a novel technique to mitigate the one-step delay inherent in parallel execution of gradient computations and communications, eliminating the need for warmup steps and aligning with the training dynamics of standard distributed optimization while converging faster in terms of wall-clock time. We demonstrate the effectiveness of $\texttt{ACCO}$ on several LLMs training and fine-tuning tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.02613v1-abstract-full').style.display = 'none'; document.getElementById('2406.02613v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.02052">arXiv:2406.02052</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.02052">pdf</a>, <a href="https://arxiv.org/format/2406.02052">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> PETRA: Parallel End-to-end Training with Reversible Architectures </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Rivaud%2C+S">St茅phane Rivaud</a>, <a href="/search/cs?searchtype=author&amp;query=Fournier%2C+L">Louis Fournier</a>, <a href="/search/cs?searchtype=author&amp;query=Pumir%2C+T">Thomas Pumir</a>, <a href="/search/cs?searchtype=author&amp;query=Belilovsky%2C+E">Eugene Belilovsky</a>, <a href="/search/cs?searchtype=author&amp;query=Eickenberg%2C+M">Michael Eickenberg</a>, <a href="/search/cs?searchtype=author&amp;query=Oyallon%2C+E">Edouard Oyallon</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.02052v1-abstract-short" style="display: inline;"> Reversible architectures have been shown to be capable of performing on par with their non-reversible architectures, being applied in deep learning for memory savings and generative modeling. In this work, we show how reversible architectures can solve challenges in parallelizing deep model training. We introduce PETRA, a novel alternative to backpropagation for parallelizing gradient computations&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.02052v1-abstract-full').style.display = 'inline'; document.getElementById('2406.02052v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.02052v1-abstract-full" style="display: none;"> Reversible architectures have been shown to be capable of performing on par with their non-reversible architectures, being applied in deep learning for memory savings and generative modeling. In this work, we show how reversible architectures can solve challenges in parallelizing deep model training. We introduce PETRA, a novel alternative to backpropagation for parallelizing gradient computations. PETRA facilitates effective model parallelism by enabling stages (i.e., a set of layers) to compute independently on different devices, while only needing to communicate activations and gradients between each other. By decoupling the forward and backward passes and keeping a single updated version of the parameters, the need for weight stashing is also removed. We develop a custom autograd-like training framework for PETRA, and we demonstrate its effectiveness on CIFAR-10, ImageNet32, and ImageNet, achieving competitive accuracies comparable to backpropagation using ResNet-18, ResNet-34, and ResNet-50 models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.02052v1-abstract-full').style.display = 'none'; document.getElementById('2406.02052v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.01365">arXiv:2406.01365</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.01365">pdf</a>, <a href="https://arxiv.org/format/2406.01365">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> From Feature Visualization to Visual Circuits: Effect of Adversarial Model Manipulation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Nanfack%2C+G">Geraldin Nanfack</a>, <a href="/search/cs?searchtype=author&amp;query=Eickenberg%2C+M">Michael Eickenberg</a>, <a href="/search/cs?searchtype=author&amp;query=Belilovsky%2C+E">Eugene Belilovsky</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.01365v1-abstract-short" style="display: inline;"> Understanding the inner working functionality of large-scale deep neural networks is challenging yet crucial in several high-stakes applications. Mechanistic inter- pretability is an emergent field that tackles this challenge, often by identifying human-understandable subgraphs in deep neural networks known as circuits. In vision-pretrained models, these subgraphs are usually interpreted by visual&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.01365v1-abstract-full').style.display = 'inline'; document.getElementById('2406.01365v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.01365v1-abstract-full" style="display: none;"> Understanding the inner working functionality of large-scale deep neural networks is challenging yet crucial in several high-stakes applications. Mechanistic inter- pretability is an emergent field that tackles this challenge, often by identifying human-understandable subgraphs in deep neural networks known as circuits. In vision-pretrained models, these subgraphs are usually interpreted by visualizing their node features through a popular technique called feature visualization. Recent works have analyzed the stability of different feature visualization types under the adversarial model manipulation framework. This paper starts by addressing limitations in existing works by proposing a novel attack called ProxPulse that simultaneously manipulates the two types of feature visualizations. Surprisingly, when analyzing these attacks under the umbrella of visual circuits, we find that visual circuits show some robustness to ProxPulse. We, therefore, introduce a new attack based on ProxPulse that unveils the manipulability of visual circuits, shedding light on their lack of robustness. The effectiveness of these attacks is validated using pre-trained AlexNet and ResNet-50 models on ImageNet. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.01365v1-abstract-full').style.display = 'none'; document.getElementById('2406.01365v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Under review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.00272">arXiv:2406.00272</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.00272">pdf</a>, <a href="https://arxiv.org/format/2406.00272">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Temporally Consistent Object Editing in Videos using Extended Attention </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zamani%2C+A">AmirHossein Zamani</a>, <a href="/search/cs?searchtype=author&amp;query=Aghdam%2C+A+G">Amir G. Aghdam</a>, <a href="/search/cs?searchtype=author&amp;query=Popa%2C+T">Tiberiu Popa</a>, <a href="/search/cs?searchtype=author&amp;query=Belilovsky%2C+E">Eugene Belilovsky</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.00272v1-abstract-short" style="display: inline;"> Image generation and editing have seen a great deal of advancements with the rise of large-scale diffusion models that allow user control of different modalities such as text, mask, depth maps, etc. However, controlled editing of videos still lags behind. Prior work in this area has focused on using 2D diffusion models to globally change the style of an existing video. On the other hand, in many p&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.00272v1-abstract-full').style.display = 'inline'; document.getElementById('2406.00272v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.00272v1-abstract-full" style="display: none;"> Image generation and editing have seen a great deal of advancements with the rise of large-scale diffusion models that allow user control of different modalities such as text, mask, depth maps, etc. However, controlled editing of videos still lags behind. Prior work in this area has focused on using 2D diffusion models to globally change the style of an existing video. On the other hand, in many practical applications, editing localized parts of the video is critical. In this work, we propose a method to edit videos using a pre-trained inpainting image diffusion model. We systematically redesign the forward path of the model by replacing the self-attention modules with an extended version of attention modules that creates frame-level dependencies. In this way, we ensure that the edited information will be consistent across all the video frames no matter what the shape and position of the masked area is. We qualitatively compare our results with state-of-the-art in terms of accuracy on several video editing tasks like object retargeting, object replacement, and object removal tasks. Simulations demonstrate the superior performance of the proposed strategy. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.00272v1-abstract-full').style.display = 'none'; document.getElementById('2406.00272v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.00153">arXiv:2406.00153</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.00153">pdf</a>, <a href="https://arxiv.org/format/2406.00153">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> $渭$LO: Compute-Efficient Meta-Generalization of Learned Optimizers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Th%C3%A9rien%2C+B">Benjamin Th茅rien</a>, <a href="/search/cs?searchtype=author&amp;query=Joseph%2C+C">Charles-脡tienne Joseph</a>, <a href="/search/cs?searchtype=author&amp;query=Knyazev%2C+B">Boris Knyazev</a>, <a href="/search/cs?searchtype=author&amp;query=Oyallon%2C+E">Edouard Oyallon</a>, <a href="/search/cs?searchtype=author&amp;query=Rish%2C+I">Irina Rish</a>, <a href="/search/cs?searchtype=author&amp;query=Belilovsky%2C+E">Eugene Belilovsky</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.00153v2-abstract-short" style="display: inline;"> Learned optimizers (LOs) can significantly reduce the wall-clock training time of neural networks, substantially reducing training costs. However, they can struggle to optimize unseen tasks (meta-generalize), especially when training networks much larger than those seen during meta-training. To address this, we derive the Maximal Update Parametrization ($渭$P) for two popular learned optimizer arch&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.00153v2-abstract-full').style.display = 'inline'; document.getElementById('2406.00153v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.00153v2-abstract-full" style="display: none;"> Learned optimizers (LOs) can significantly reduce the wall-clock training time of neural networks, substantially reducing training costs. However, they can struggle to optimize unseen tasks (meta-generalize), especially when training networks much larger than those seen during meta-training. To address this, we derive the Maximal Update Parametrization ($渭$P) for two popular learned optimizer architectures and propose a simple meta-training recipe for $渭$-parameterized LOs ($渭$LOs). Our empirical evaluation demonstrates that LOs meta-trained with our recipe substantially improve meta-generalization to wider unseen tasks when compared to LOs trained under standard parametrization (e.g., as they are trained in existing work). When applying our $渭$LOs, each trained for less than 250 GPU-hours, to large-width models we are often able to match or exceed the performance of pre-trained VeLO, the most performant publicly available learned optimizer, meta-trained with 4000 TPU-months of compute. We also observe that learned optimizers trained with our $渭$LO recipe also exhibit substantially improved meta-generalization to deeper networks ($5\times$ meta-training) and remarkable generalization to much longer training horizons ($25\times$ meta-training). <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.00153v2-abstract-full').style.display = 'none'; document.getElementById('2406.00153v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 31 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.17517">arXiv:2405.17517</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2405.17517">pdf</a>, <a href="https://arxiv.org/format/2405.17517">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> WASH: Train your Ensemble with Communication-Efficient Weight Shuffling, then Average </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Fournier%2C+L">Louis Fournier</a>, <a href="/search/cs?searchtype=author&amp;query=Nabli%2C+A">Adel Nabli</a>, <a href="/search/cs?searchtype=author&amp;query=Aminbeidokhti%2C+M">Masih Aminbeidokhti</a>, <a href="/search/cs?searchtype=author&amp;query=Pedersoli%2C+M">Marco Pedersoli</a>, <a href="/search/cs?searchtype=author&amp;query=Belilovsky%2C+E">Eugene Belilovsky</a>, <a href="/search/cs?searchtype=author&amp;query=Oyallon%2C+E">Edouard Oyallon</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.17517v1-abstract-short" style="display: inline;"> The performance of deep neural networks is enhanced by ensemble methods, which average the output of several models. However, this comes at an increased cost at inference. Weight averaging methods aim at balancing the generalization of ensembling and the inference speed of a single model by averaging the parameters of an ensemble of models. Yet, naive averaging results in poor performance as model&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.17517v1-abstract-full').style.display = 'inline'; document.getElementById('2405.17517v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.17517v1-abstract-full" style="display: none;"> The performance of deep neural networks is enhanced by ensemble methods, which average the output of several models. However, this comes at an increased cost at inference. Weight averaging methods aim at balancing the generalization of ensembling and the inference speed of a single model by averaging the parameters of an ensemble of models. Yet, naive averaging results in poor performance as models converge to different loss basins, and aligning the models to improve the performance of the average is challenging. Alternatively, inspired by distributed training, methods like DART and PAPA have been proposed to train several models in parallel such that they will end up in the same basin, resulting in good averaging accuracy. However, these methods either compromise ensembling accuracy or demand significant communication between models during training. In this paper, we introduce WASH, a novel distributed method for training model ensembles for weight averaging that achieves state-of-the-art image classification accuracy. WASH maintains models within the same basin by randomly shuffling a small percentage of weights during training, resulting in diverse models and lower communication costs compared to standard parameter averaging methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.17517v1-abstract-full').style.display = 'none'; document.getElementById('2405.17517v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.16397">arXiv:2405.16397</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2405.16397">pdf</a>, <a href="https://arxiv.org/format/2405.16397">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Optimization and Control">math.OC</span> </div> </div> <p class="title is-5 mathjax"> AdaFisher: Adaptive Second Order Optimization via Fisher Information </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gomes%2C+D+M">Damien Martins Gomes</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yanlei Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Belilovsky%2C+E">Eugene Belilovsky</a>, <a href="/search/cs?searchtype=author&amp;query=Wolf%2C+G">Guy Wolf</a>, <a href="/search/cs?searchtype=author&amp;query=Hosseini%2C+M+S">Mahdi S. Hosseini</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.16397v2-abstract-short" style="display: inline;"> First-order optimization methods are currently the mainstream in training deep neural networks (DNNs). Optimizers like Adam incorporate limited curvature information by employing the diagonal matrix preconditioning of the stochastic gradient during the training. Despite their widespread, second-order optimization algorithms exhibit superior convergence properties compared to their first-order coun&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.16397v2-abstract-full').style.display = 'inline'; document.getElementById('2405.16397v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.16397v2-abstract-full" style="display: none;"> First-order optimization methods are currently the mainstream in training deep neural networks (DNNs). Optimizers like Adam incorporate limited curvature information by employing the diagonal matrix preconditioning of the stochastic gradient during the training. Despite their widespread, second-order optimization algorithms exhibit superior convergence properties compared to their first-order counterparts e.g. Adam and SGD. However, their practicality in training DNNs are still limited due to increased per-iteration computations and suboptimal accuracy compared to the first order methods. We present AdaFisher--an adaptive second-order optimizer that leverages a block-diagonal approximation to the Fisher information matrix for adaptive gradient preconditioning. AdaFisher aims to bridge the gap between enhanced convergence capabilities and computational efficiency in second-order optimization framework for training DNNs. Despite the slow pace of second-order optimizers, we showcase that AdaFisher can be reliably adopted for image classification, language modelling and stand out for its stability and robustness in hyperparameter tuning. We demonstrate that AdaFisher outperforms the SOTA optimizers in terms of both accuracy and convergence speed. Code is available from https://github.com/AtlasAnalyticsLab/AdaFisher. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.16397v2-abstract-full').style.display = 'none'; document.getElementById('2405.16397v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 25 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.08763">arXiv:2403.08763</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.08763">pdf</a>, <a href="https://arxiv.org/format/2403.08763">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Simple and Scalable Strategies to Continually Pre-train Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ibrahim%2C+A">Adam Ibrahim</a>, <a href="/search/cs?searchtype=author&amp;query=Th%C3%A9rien%2C+B">Benjamin Th茅rien</a>, <a href="/search/cs?searchtype=author&amp;query=Gupta%2C+K">Kshitij Gupta</a>, <a href="/search/cs?searchtype=author&amp;query=Richter%2C+M+L">Mats L. Richter</a>, <a href="/search/cs?searchtype=author&amp;query=Anthony%2C+Q">Quentin Anthony</a>, <a href="/search/cs?searchtype=author&amp;query=Lesort%2C+T">Timoth茅e Lesort</a>, <a href="/search/cs?searchtype=author&amp;query=Belilovsky%2C+E">Eugene Belilovsky</a>, <a href="/search/cs?searchtype=author&amp;query=Rish%2C+I">Irina Rish</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.08763v4-abstract-short" style="display: inline;"> Large language models (LLMs) are routinely pre-trained on billions of tokens, only to start the process over again once new data becomes available. A much more efficient solution is to continually pre-train these models, saving significant compute compared to re-training. However, the distribution shift induced by new data typically results in degraded performance on previous data or poor adaptati&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.08763v4-abstract-full').style.display = 'inline'; document.getElementById('2403.08763v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.08763v4-abstract-full" style="display: none;"> Large language models (LLMs) are routinely pre-trained on billions of tokens, only to start the process over again once new data becomes available. A much more efficient solution is to continually pre-train these models, saving significant compute compared to re-training. However, the distribution shift induced by new data typically results in degraded performance on previous data or poor adaptation to the new data. In this work, we show that a simple and scalable combination of learning rate (LR) re-warming, LR re-decaying, and replay of previous data is sufficient to match the performance of fully re-training from scratch on all available data, as measured by the final loss and the average score on several language model (LM) evaluation benchmarks. Specifically, we show this for a weak but realistic distribution shift between two commonly used LLM pre-training datasets (English$\rightarrow$English) and a stronger distribution shift (English$\rightarrow$German) at the $405$M parameter model scale with large dataset sizes (hundreds of billions of tokens). Selecting the weak but realistic shift for larger-scale experiments, we also find that our continual learning strategies match the re-training baseline for a 10B parameter LLM. Our results demonstrate that LLMs can be successfully updated via simple and scalable continual learning strategies, matching the re-training baseline using only a fraction of the compute. Finally, inspired by previous work, we propose alternatives to the cosine learning rate schedule that help circumvent forgetting induced by LR re-warming and that are not bound to a fixed token budget. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.08763v4-abstract-full').style.display = 'none'; document.getElementById('2403.08763v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.04958">arXiv:2402.04958</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2402.04958">pdf</a>, <a href="https://arxiv.org/format/2402.04958">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Channel-Selective Normalization for Label-Shift Robust Test-Time Adaptation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Vianna%2C+P">Pedro Vianna</a>, <a href="/search/cs?searchtype=author&amp;query=Chaudhary%2C+M">Muawiz Chaudhary</a>, <a href="/search/cs?searchtype=author&amp;query=Mehrbod%2C+P">Paria Mehrbod</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+A">An Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Cloutier%2C+G">Guy Cloutier</a>, <a href="/search/cs?searchtype=author&amp;query=Wolf%2C+G">Guy Wolf</a>, <a href="/search/cs?searchtype=author&amp;query=Eickenberg%2C+M">Michael Eickenberg</a>, <a href="/search/cs?searchtype=author&amp;query=Belilovsky%2C+E">Eugene Belilovsky</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.04958v2-abstract-short" style="display: inline;"> Deep neural networks have useful applications in many different tasks, however their performance can be severely affected by changes in the data distribution. For example, in the biomedical field, their performance can be affected by changes in the data (different machines, populations) between training and test datasets. To ensure robustness and generalization to real-world scenarios, test-time a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.04958v2-abstract-full').style.display = 'inline'; document.getElementById('2402.04958v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.04958v2-abstract-full" style="display: none;"> Deep neural networks have useful applications in many different tasks, however their performance can be severely affected by changes in the data distribution. For example, in the biomedical field, their performance can be affected by changes in the data (different machines, populations) between training and test datasets. To ensure robustness and generalization to real-world scenarios, test-time adaptation has been recently studied as an approach to adjust models to a new data distribution during inference. Test-time batch normalization is a simple and popular method that achieved compelling performance on domain shift benchmarks. It is implemented by recalculating batch normalization statistics on test batches. Prior work has focused on analysis with test data that has the same label distribution as the training data. However, in many practical applications this technique is vulnerable to label distribution shifts, sometimes producing catastrophic failure. This presents a risk in applying test time adaptation methods in deployment. We propose to tackle this challenge by only selectively adapting channels in a deep network, minimizing drastic adaptation that is sensitive to label shifts. Our selection scheme is based on two principles that we empirically motivate: (1) later layers of networks are more sensitive to label shift (2) individual features can be sensitive to specific classes. We apply the proposed technique to three classification tasks, including CIFAR10-C, Imagenet-C, and diagnosis of fatty liver, where we explore both covariate and label distribution shifts. We find that our method allows to bring the benefits of TTA while significantly reducing the risk of failure common in other methods, while being robust to choice in hyperparameters. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.04958v2-abstract-full').style.display = 'none'; document.getElementById('2402.04958v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at the Conference on Lifelong Learning Agents (CoLLAs) 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.06795">arXiv:2312.06795</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2312.06795">pdf</a>, <a href="https://arxiv.org/format/2312.06795">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Model Breadcrumbs: Scaling Multi-Task Model Merging with Sparse Masks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Davari%2C+M">MohammadReza Davari</a>, <a href="/search/cs?searchtype=author&amp;query=Belilovsky%2C+E">Eugene Belilovsky</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.06795v2-abstract-short" style="display: inline;"> The rapid development of AI systems has been greatly influenced by the emergence of foundation models. A common approach for targeted problems involves fine-tuning these pre-trained foundation models for specific target tasks, resulting in a rapid spread of models fine-tuned across a diverse array of tasks. This work focuses on the problem of merging multiple fine-tunings of the same foundation mo&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.06795v2-abstract-full').style.display = 'inline'; document.getElementById('2312.06795v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.06795v2-abstract-full" style="display: none;"> The rapid development of AI systems has been greatly influenced by the emergence of foundation models. A common approach for targeted problems involves fine-tuning these pre-trained foundation models for specific target tasks, resulting in a rapid spread of models fine-tuned across a diverse array of tasks. This work focuses on the problem of merging multiple fine-tunings of the same foundation model derived from a spectrum of auxiliary tasks. We introduce a new simple method, Model Breadcrumbs, which consists of a sparsely defined weight set that guides model adaptation within the weight space of a pre-trained model. These breadcrumbs are constructed by subtracting the weights from a pre-trained model before and after fine-tuning, followed by a sparsification process that eliminates weight outliers and negligible perturbations. Our experiments demonstrate the effectiveness of Model Breadcrumbs to simultaneously improve performance across multiple tasks. This contribution aligns with the evolving paradigm of updatable machine learning, reminiscent of the collaborative principles underlying open-source software development, fostering a community-driven effort to reliably update machine learning models. Our method is shown to be more efficient and unlike previous proposals does not require hyperparameter tuning for each new task added. Through extensive experimentation involving various models, tasks, and modalities we establish that integrating Model Breadcrumbs offers a simple, efficient, and highly effective approach for constructing multi-task models and facilitating updates to foundation models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.06795v2-abstract-full').style.display = 'none'; document.getElementById('2312.06795v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 11 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published in ECCV 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.02204">arXiv:2312.02204</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2312.02204">pdf</a>, <a href="https://arxiv.org/format/2312.02204">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Can We Learn Communication-Efficient Optimizers? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Joseph%2C+C">Charles-脡tienne Joseph</a>, <a href="/search/cs?searchtype=author&amp;query=Th%C3%A9rien%2C+B">Benjamin Th茅rien</a>, <a href="/search/cs?searchtype=author&amp;query=Moudgil%2C+A">Abhinav Moudgil</a>, <a href="/search/cs?searchtype=author&amp;query=Knyazev%2C+B">Boris Knyazev</a>, <a href="/search/cs?searchtype=author&amp;query=Belilovsky%2C+E">Eugene Belilovsky</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.02204v1-abstract-short" style="display: inline;"> Communication-efficient variants of SGD, specifically local SGD, have received a great deal of interest in recent years. These approaches compute multiple gradient steps locally, that is on each worker, before averaging model parameters, helping relieve the critical communication bottleneck in distributed deep learning training. Although many variants of these approaches have been proposed, they c&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.02204v1-abstract-full').style.display = 'inline'; document.getElementById('2312.02204v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.02204v1-abstract-full" style="display: none;"> Communication-efficient variants of SGD, specifically local SGD, have received a great deal of interest in recent years. These approaches compute multiple gradient steps locally, that is on each worker, before averaging model parameters, helping relieve the critical communication bottleneck in distributed deep learning training. Although many variants of these approaches have been proposed, they can sometimes lag behind state-of-the-art adaptive optimizers for deep learning. In this work, we investigate if the recent progress in the emerging area of learned optimizers can potentially close this gap while remaining communication-efficient. Specifically, we meta-learn how to perform global updates given an update from local SGD iterations. Our results demonstrate that learned optimizers can substantially outperform local SGD and its sophisticated variants while maintaining their communication efficiency. Learned optimizers can even generalize to unseen and much larger datasets and architectures, including ImageNet and ViTs, and to unseen modalities such as language modeling. We therefore demonstrate the potential of learned optimizers for improving communication-efficient distributed learning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.02204v1-abstract-full').style.display = 'none'; document.getElementById('2312.02204v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.04561">arXiv:2310.04561</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2310.04561">pdf</a>, <a href="https://arxiv.org/format/2310.04561">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> DragD3D: Realistic Mesh Editing with Rigidity Control Driven by 2D Diffusion Priors </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xie%2C+T">Tianhao Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Belilovsky%2C+E">Eugene Belilovsky</a>, <a href="/search/cs?searchtype=author&amp;query=Mudur%2C+S">Sudhir Mudur</a>, <a href="/search/cs?searchtype=author&amp;query=Popa%2C+T">Tiberiu Popa</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.04561v2-abstract-short" style="display: inline;"> Direct mesh editing and deformation are key components in the geometric modeling and animation pipeline. Mesh editing methods are typically framed as optimization problems combining user-specified vertex constraints with a regularizer that determines the position of the rest of the vertices. The choice of the regularizer is key to the realism and authenticity of the final result. Physics and geome&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.04561v2-abstract-full').style.display = 'inline'; document.getElementById('2310.04561v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.04561v2-abstract-full" style="display: none;"> Direct mesh editing and deformation are key components in the geometric modeling and animation pipeline. Mesh editing methods are typically framed as optimization problems combining user-specified vertex constraints with a regularizer that determines the position of the rest of the vertices. The choice of the regularizer is key to the realism and authenticity of the final result. Physics and geometry-based regularizers are not aware of the global context and semantics of the object, and the more recent deep learning priors are limited to a specific class of 3D object deformations. Our main contribution is a vertex-based mesh editing method called DragD3D based on (1) a novel optimization formulation that decouples the rotation and stretch components of the deformation and combines a 3D geometric regularizer with (2) the recently introduced DDS loss which scores the faithfulness of the rendered 2D image to one from a diffusion model. Thus, our deformation method achieves globally realistic shape deformation which is not restricted to any class of objects. Our new formulation optimizes directly the transformation of the neural Jacobian field explicitly separating the rotational and stretching components. The objective function of the optimization combines the approximate gradients of DDS and the gradients from the geometric loss to satisfy the vertex constraints. Additional user control over desired global shape deformation is made possible by allowing explicit per-triangle deformation control as well as explicit separation of rotational and stretching components of the deformation. We show that our deformations can be controlled to yield realistic shape deformations that are aware of the global context of the objects, and provide better results than just using geometric regularizers. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.04561v2-abstract-full').style.display = 'none'; document.getElementById('2310.04561v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, 8 figures, project page: https://tianhaoxie.github.io/project/DragD3D/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2308.04014">arXiv:2308.04014</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2308.04014">pdf</a>, <a href="https://arxiv.org/format/2308.04014">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Continual Pre-Training of Large Language Models: How to (re)warm your model? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gupta%2C+K">Kshitij Gupta</a>, <a href="/search/cs?searchtype=author&amp;query=Th%C3%A9rien%2C+B">Benjamin Th茅rien</a>, <a href="/search/cs?searchtype=author&amp;query=Ibrahim%2C+A">Adam Ibrahim</a>, <a href="/search/cs?searchtype=author&amp;query=Richter%2C+M+L">Mats L. Richter</a>, <a href="/search/cs?searchtype=author&amp;query=Anthony%2C+Q">Quentin Anthony</a>, <a href="/search/cs?searchtype=author&amp;query=Belilovsky%2C+E">Eugene Belilovsky</a>, <a href="/search/cs?searchtype=author&amp;query=Rish%2C+I">Irina Rish</a>, <a href="/search/cs?searchtype=author&amp;query=Lesort%2C+T">Timoth茅e Lesort</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2308.04014v2-abstract-short" style="display: inline;"> Large language models (LLMs) are routinely pre-trained on billions of tokens, only to restart the process over again once new data becomes available. A much cheaper and more efficient solution would be to enable the continual pre-training of these models, i.e. updating pre-trained models with new data instead of re-training them from scratch. However, the distribution shift induced by novel data t&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.04014v2-abstract-full').style.display = 'inline'; document.getElementById('2308.04014v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2308.04014v2-abstract-full" style="display: none;"> Large language models (LLMs) are routinely pre-trained on billions of tokens, only to restart the process over again once new data becomes available. A much cheaper and more efficient solution would be to enable the continual pre-training of these models, i.e. updating pre-trained models with new data instead of re-training them from scratch. However, the distribution shift induced by novel data typically results in degraded performance on past data. Taking a step towards efficient continual pre-training, in this work, we examine the effect of different warm-up strategies. Our hypothesis is that the learning rate must be re-increased to improve compute efficiency when training on a new dataset. We study the warmup phase of models pre-trained on the Pile (upstream data, 300B tokens) as we continue to pre-train on SlimPajama (downstream data, 297B tokens), following a linear warmup and cosine decay schedule. We conduct all experiments on the Pythia 410M language model architecture and evaluate performance through validation perplexity. We experiment with different pre-training checkpoints, various maximum learning rates, and various warmup lengths. Our results show that while rewarming models first increases the loss on upstream and downstream data, in the longer run it improves the downstream performance, outperforming models trained from scratch$\unicode{x2013}$even for a large downstream dataset. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.04014v2-abstract-full').style.display = 'none'; document.getElementById('2308.04014v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.08289">arXiv:2306.08289</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2306.08289">pdf</a>, <a href="https://arxiv.org/format/2306.08289">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> $\textbf{A}^2\textbf{CiD}^2$: Accelerating Asynchronous Communication in Decentralized Deep Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Nabli%2C+A">Adel Nabli</a>, <a href="/search/cs?searchtype=author&amp;query=Belilovsky%2C+E">Eugene Belilovsky</a>, <a href="/search/cs?searchtype=author&amp;query=Oyallon%2C+E">Edouard Oyallon</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.08289v2-abstract-short" style="display: inline;"> Distributed training of Deep Learning models has been critical to many recent successes in the field. Current standard methods primarily rely on synchronous centralized algorithms which induce major communication bottlenecks and synchronization locks at scale. Decentralized asynchronous algorithms are emerging as a potential alternative but their practical applicability still lags. In order to mit&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.08289v2-abstract-full').style.display = 'inline'; document.getElementById('2306.08289v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.08289v2-abstract-full" style="display: none;"> Distributed training of Deep Learning models has been critical to many recent successes in the field. Current standard methods primarily rely on synchronous centralized algorithms which induce major communication bottlenecks and synchronization locks at scale. Decentralized asynchronous algorithms are emerging as a potential alternative but their practical applicability still lags. In order to mitigate the increase in communication cost that naturally comes with scaling the number of workers, we introduce a principled asynchronous, randomized, gossip-based optimization algorithm which works thanks to a continuous local momentum named $\textbf{A}^2\textbf{CiD}^2$. Our method allows each worker to continuously process mini-batches without stopping, and run a peer-to-peer averaging routine in parallel, reducing idle time. In addition to inducing a significant communication acceleration at no cost other than adding a local momentum variable, minimal adaptation is required to incorporate $\textbf{A}^2\textbf{CiD}^2$ to standard asynchronous approaches. Our theoretical analysis proves accelerated rates compared to previous asynchronous decentralized baselines and we empirically show that using our $\textbf{A}^2\textbf{CiD}^2$ momentum significantly decrease communication costs in poorly connected networks. In particular, we show consistent improvement on the ImageNet dataset using up to 64 asynchronous workers (A100 GPUs) and various communication network topologies. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.08289v2-abstract-full').style.display = 'none'; document.getElementById('2306.08289v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Thirty-seventh Conference on Neural Information Processing Systems, Dec 2023, New Orleans, United States </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.07397">arXiv:2306.07397</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2306.07397">pdf</a>, <a href="https://arxiv.org/format/2306.07397">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Adversarial Attacks on the Interpretation of Neuron Activation Maximization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Nanfack%2C+G">Geraldin Nanfack</a>, <a href="/search/cs?searchtype=author&amp;query=Fulleringer%2C+A">Alexander Fulleringer</a>, <a href="/search/cs?searchtype=author&amp;query=Marty%2C+J">Jonathan Marty</a>, <a href="/search/cs?searchtype=author&amp;query=Eickenberg%2C+M">Michael Eickenberg</a>, <a href="/search/cs?searchtype=author&amp;query=Belilovsky%2C+E">Eugene Belilovsky</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.07397v1-abstract-short" style="display: inline;"> The internal functional behavior of trained Deep Neural Networks is notoriously difficult to interpret. Activation-maximization approaches are one set of techniques used to interpret and analyze trained deep-learning models. These consist in finding inputs that maximally activate a given neuron or feature map. These inputs can be selected from a data set or obtained by optimization. However, inter&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.07397v1-abstract-full').style.display = 'inline'; document.getElementById('2306.07397v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.07397v1-abstract-full" style="display: none;"> The internal functional behavior of trained Deep Neural Networks is notoriously difficult to interpret. Activation-maximization approaches are one set of techniques used to interpret and analyze trained deep-learning models. These consist in finding inputs that maximally activate a given neuron or feature map. These inputs can be selected from a data set or obtained by optimization. However, interpretability methods may be subject to being deceived. In this work, we consider the concept of an adversary manipulating a model for the purpose of deceiving the interpretation. We propose an optimization framework for performing this manipulation and demonstrate a number of ways that popular activation-maximization interpretation techniques associated with CNNs can be manipulated to change the interpretations, shedding light on the reliability of these methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.07397v1-abstract-full').style.display = 'none'; document.getElementById('2306.07397v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.06968">arXiv:2306.06968</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2306.06968">pdf</a>, <a href="https://arxiv.org/format/2306.06968">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Can Forward Gradient Match Backpropagation? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Fournier%2C+L">Louis Fournier</a>, <a href="/search/cs?searchtype=author&amp;query=Rivaud%2C+S">St茅phane Rivaud</a>, <a href="/search/cs?searchtype=author&amp;query=Belilovsky%2C+E">Eugene Belilovsky</a>, <a href="/search/cs?searchtype=author&amp;query=Eickenberg%2C+M">Michael Eickenberg</a>, <a href="/search/cs?searchtype=author&amp;query=Oyallon%2C+E">Edouard Oyallon</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.06968v1-abstract-short" style="display: inline;"> Forward Gradients - the idea of using directional derivatives in forward differentiation mode - have recently been shown to be utilizable for neural network training while avoiding problems generally associated with backpropagation gradient computation, such as locking and memorization requirements. The cost is the requirement to guess the step direction, which is hard in high dimensions. While c&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.06968v1-abstract-full').style.display = 'inline'; document.getElementById('2306.06968v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.06968v1-abstract-full" style="display: none;"> Forward Gradients - the idea of using directional derivatives in forward differentiation mode - have recently been shown to be utilizable for neural network training while avoiding problems generally associated with backpropagation gradient computation, such as locking and memorization requirements. The cost is the requirement to guess the step direction, which is hard in high dimensions. While current solutions rely on weighted averages over isotropic guess vector distributions, we propose to strongly bias our gradient guesses in directions that are much more promising, such as feedback obtained from small, local auxiliary networks. For a standard computer vision neural network, we conduct a rigorous study systematically covering a variety of combinations of gradient targets and gradient guesses, including those previously presented in the literature. We find that using gradients obtained from a local loss as a candidate direction drastically improves on random noise in Forward Gradient methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.06968v1-abstract-full').style.display = 'none'; document.getElementById('2306.06968v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Fortieth International Conference on Machine Learning, Jul 2023, Honolulu (Hawaii), USA, United States </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.03937">arXiv:2306.03937</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2306.03937">pdf</a>, <a href="https://arxiv.org/format/2306.03937">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Guiding The Last Layer in Federated Learning with Pre-Trained Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Legate%2C+G">Gwen Legate</a>, <a href="/search/cs?searchtype=author&amp;query=Bernier%2C+N">Nicolas Bernier</a>, <a href="/search/cs?searchtype=author&amp;query=Caccia%2C+L">Lucas Caccia</a>, <a href="/search/cs?searchtype=author&amp;query=Oyallon%2C+E">Edouard Oyallon</a>, <a href="/search/cs?searchtype=author&amp;query=Belilovsky%2C+E">Eugene Belilovsky</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.03937v2-abstract-short" style="display: inline;"> Federated Learning (FL) is an emerging paradigm that allows a model to be trained across a number of participants without sharing data. Recent works have begun to consider the effects of using pre-trained models as an initialization point for existing FL algorithms; however, these approaches ignore the vast body of efficient transfer learning literature from the centralized learning setting. Here&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.03937v2-abstract-full').style.display = 'inline'; document.getElementById('2306.03937v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.03937v2-abstract-full" style="display: none;"> Federated Learning (FL) is an emerging paradigm that allows a model to be trained across a number of participants without sharing data. Recent works have begun to consider the effects of using pre-trained models as an initialization point for existing FL algorithms; however, these approaches ignore the vast body of efficient transfer learning literature from the centralized learning setting. Here we revisit the problem of FL from a pre-trained model considered in prior work and expand it to a set of computer vision transfer learning problems. We first observe that simply fitting a linear classification head can be efficient and effective in many cases. We then show that in the FL setting, fitting a classifier using the Nearest Class Means (NCM) can be done exactly and orders of magnitude more efficiently than existing proposals, while obtaining strong performance. Finally, we demonstrate that using a two-phase approach of obtaining the classifier and then fine-tuning the model can yield rapid convergence and improved generalization in the federated setting. We demonstrate the potential our method has to reduce communication and compute costs while achieving better model performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.03937v2-abstract-full').style.display = 'none'; document.getElementById('2306.03937v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2304.05260">arXiv:2304.05260</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2304.05260">pdf</a>, <a href="https://arxiv.org/format/2304.05260">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Re-Weighted Softmax Cross-Entropy to Control Forgetting in Federated Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Legate%2C+G">Gwen Legate</a>, <a href="/search/cs?searchtype=author&amp;query=Caccia%2C+L">Lucas Caccia</a>, <a href="/search/cs?searchtype=author&amp;query=Belilovsky%2C+E">Eugene Belilovsky</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2304.05260v1-abstract-short" style="display: inline;"> In Federated Learning, a global model is learned by aggregating model updates computed at a set of independent client nodes, to reduce communication costs multiple gradient steps are performed at each node prior to aggregation. A key challenge in this setting is data heterogeneity across clients resulting in differing local objectives which can lead clients to overly minimize their own local objec&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.05260v1-abstract-full').style.display = 'inline'; document.getElementById('2304.05260v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2304.05260v1-abstract-full" style="display: none;"> In Federated Learning, a global model is learned by aggregating model updates computed at a set of independent client nodes, to reduce communication costs multiple gradient steps are performed at each node prior to aggregation. A key challenge in this setting is data heterogeneity across clients resulting in differing local objectives which can lead clients to overly minimize their own local objective, diverging from the global solution. We demonstrate that individual client models experience a catastrophic forgetting with respect to data from other clients and propose an efficient approach that modifies the cross-entropy objective on a per-client basis by re-weighting the softmax logits prior to computing the loss. This approach shields classes outside a client&#39;s label set from abrupt representation change and we empirically demonstrate it can alleviate client forgetting and provide consistent improvements to standard federated learning algorithms. Our method is particularly beneficial under the most challenging federated learning settings where data heterogeneity is high and client participation in each round is low. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.05260v1-abstract-full').style.display = 'none'; document.getElementById('2304.05260v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 April, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2304.04858">arXiv:2304.04858</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2304.04858">pdf</a>, <a href="https://arxiv.org/format/2304.04858">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Simulated Annealing in Early Layers Leads to Better Generalization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Sarfi%2C+A">Amirmohammad Sarfi</a>, <a href="/search/cs?searchtype=author&amp;query=Karimpour%2C+Z">Zahra Karimpour</a>, <a href="/search/cs?searchtype=author&amp;query=Chaudhary%2C+M">Muawiz Chaudhary</a>, <a href="/search/cs?searchtype=author&amp;query=Khalid%2C+N+M">Nasir M. Khalid</a>, <a href="/search/cs?searchtype=author&amp;query=Ravanelli%2C+M">Mirco Ravanelli</a>, <a href="/search/cs?searchtype=author&amp;query=Mudur%2C+S">Sudhir Mudur</a>, <a href="/search/cs?searchtype=author&amp;query=Belilovsky%2C+E">Eugene Belilovsky</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2304.04858v1-abstract-short" style="display: inline;"> Recently, a number of iterative learning methods have been introduced to improve generalization. These typically rely on training for longer periods of time in exchange for improved generalization. LLF (later-layer-forgetting) is a state-of-the-art method in this category. It strengthens learning in early layers by periodically re-initializing the last few layers of the network. Our principal inno&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.04858v1-abstract-full').style.display = 'inline'; document.getElementById('2304.04858v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2304.04858v1-abstract-full" style="display: none;"> Recently, a number of iterative learning methods have been introduced to improve generalization. These typically rely on training for longer periods of time in exchange for improved generalization. LLF (later-layer-forgetting) is a state-of-the-art method in this category. It strengthens learning in early layers by periodically re-initializing the last few layers of the network. Our principal innovation in this work is to use Simulated annealing in EArly Layers (SEAL) of the network in place of re-initialization of later layers. Essentially, later layers go through the normal gradient descent process, while the early layers go through short stints of gradient ascent followed by gradient descent. Extensive experiments on the popular Tiny-ImageNet dataset benchmark and a series of transfer learning and few-shot learning tasks show that we outperform LLF by a significant margin. We further show that, compared to normal training, LLF features, although improving on the target task, degrade the transfer learning performance across all datasets we explored. In comparison, our method outperforms LLF across the same target datasets by a large margin. We also show that the prediction depth of our method is significantly lower than that of LLF and normal training, indicating on average better prediction performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.04858v1-abstract-full').style.display = 'none'; document.getElementById('2304.04858v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 April, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2303.14771">arXiv:2303.14771</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2303.14771">pdf</a>, <a href="https://arxiv.org/format/2303.14771">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Prototype-Sample Relation Distillation: Towards Replay-Free Continual Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Asadi%2C+N">Nader Asadi</a>, <a href="/search/cs?searchtype=author&amp;query=Davari%2C+M">MohammadReza Davari</a>, <a href="/search/cs?searchtype=author&amp;query=Mudur%2C+S">Sudhir Mudur</a>, <a href="/search/cs?searchtype=author&amp;query=Aljundi%2C+R">Rahaf Aljundi</a>, <a href="/search/cs?searchtype=author&amp;query=Belilovsky%2C+E">Eugene Belilovsky</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2303.14771v2-abstract-short" style="display: inline;"> In Continual learning (CL) balancing effective adaptation while combating catastrophic forgetting is a central challenge. Many of the recent best-performing methods utilize various forms of prior task data, e.g. a replay buffer, to tackle the catastrophic forgetting problem. Having access to previous task data can be restrictive in many real-world scenarios, for example when task data is sensitive&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.14771v2-abstract-full').style.display = 'inline'; document.getElementById('2303.14771v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2303.14771v2-abstract-full" style="display: none;"> In Continual learning (CL) balancing effective adaptation while combating catastrophic forgetting is a central challenge. Many of the recent best-performing methods utilize various forms of prior task data, e.g. a replay buffer, to tackle the catastrophic forgetting problem. Having access to previous task data can be restrictive in many real-world scenarios, for example when task data is sensitive or proprietary. To overcome the necessity of using previous tasks&#39; data, in this work, we start with strong representation learning methods that have been shown to be less prone to forgetting. We propose a holistic approach to jointly learn the representation and class prototypes while maintaining the relevance of old class prototypes and their embedded similarities. Specifically, samples are mapped to an embedding space where the representations are learned using a supervised contrastive loss. Class prototypes are evolved continually in the same latent space, enabling learning and prediction at any point. To continually adapt the prototypes without keeping any prior task data, we propose a novel distillation loss that constrains class prototypes to maintain relative similarities as compared to new task data. This method yields state-of-the-art performance in the task-incremental setting, outperforming methods relying on large amounts of data, and provides strong performance in the class-incremental setting without using any stored data points. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.14771v2-abstract-full').style.display = 'none'; document.getElementById('2303.14771v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 March, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at ICML 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2302.06540">arXiv:2302.06540</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2302.06540">pdf</a>, <a href="https://arxiv.org/format/2302.06540">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> </div> </div> <p class="title is-5 mathjax"> Imitation from Observation With Bootstrapped Contrastive Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Sonwa%2C+M">Medric Sonwa</a>, <a href="/search/cs?searchtype=author&amp;query=Hansen%2C+J">Johanna Hansen</a>, <a href="/search/cs?searchtype=author&amp;query=Belilovsky%2C+E">Eugene Belilovsky</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2302.06540v1-abstract-short" style="display: inline;"> Imitation from observation (IfO) is a learning paradigm that consists of training autonomous agents in a Markov Decision Process (MDP) by observing expert demonstrations without access to its actions. These demonstrations could be sequences of environment states or raw visual observations of the environment. Recent work in IfO has focused on this problem in the case of observations of low-dimensio&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2302.06540v1-abstract-full').style.display = 'inline'; document.getElementById('2302.06540v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2302.06540v1-abstract-full" style="display: none;"> Imitation from observation (IfO) is a learning paradigm that consists of training autonomous agents in a Markov Decision Process (MDP) by observing expert demonstrations without access to its actions. These demonstrations could be sequences of environment states or raw visual observations of the environment. Recent work in IfO has focused on this problem in the case of observations of low-dimensional environment states, however, access to these highly-specific observations is unlikely in practice. In this paper, we adopt a challenging, but more realistic problem formulation, learning control policies that operate on a learned latent space with access only to visual demonstrations of an expert completing a task. We present BootIfOL, an IfO algorithm that aims to learn a reward function that takes an agent trajectory and compares it to an expert, providing rewards based on similarity to agent behavior and implicit goal. We consider this reward function to be a distance metric between trajectories of agent behavior and learn it via contrastive learning. The contrastive learning objective aims to closely represent expert trajectories and to distance them from non-expert trajectories. The set of non-expert trajectories used in contrastive learning is made progressively more complex by bootstrapping from roll-outs of the agent learned through RL using the current reward function. We evaluate our approach on a variety of control tasks showing that we can train effective policies using a limited number of demonstrative trajectories, greatly improving on prior approaches that consider raw observations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2302.06540v1-abstract-full').style.display = 'none'; document.getElementById('2302.06540v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 February, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2301.07635">arXiv:2301.07635</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2301.07635">pdf</a>, <a href="https://arxiv.org/format/2301.07635">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> </div> </div> <p class="title is-5 mathjax"> Local Learning with Neuron Groups </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Patel%2C+A">Adeetya Patel</a>, <a href="/search/cs?searchtype=author&amp;query=Eickenberg%2C+M">Michael Eickenberg</a>, <a href="/search/cs?searchtype=author&amp;query=Belilovsky%2C+E">Eugene Belilovsky</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2301.07635v1-abstract-short" style="display: inline;"> Traditional deep network training methods optimize a monolithic objective function jointly for all the components. This can lead to various inefficiencies in terms of potential parallelization. Local learning is an approach to model-parallelism that removes the standard end-to-end learning setup and utilizes local objective functions to permit parallel learning amongst model components in a deep n&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2301.07635v1-abstract-full').style.display = 'inline'; document.getElementById('2301.07635v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2301.07635v1-abstract-full" style="display: none;"> Traditional deep network training methods optimize a monolithic objective function jointly for all the components. This can lead to various inefficiencies in terms of potential parallelization. Local learning is an approach to model-parallelism that removes the standard end-to-end learning setup and utilizes local objective functions to permit parallel learning amongst model components in a deep network. Recent works have demonstrated that variants of local learning can lead to efficient training of modern deep networks. However, in terms of how much computation can be distributed, these approaches are typically limited by the number of layers in a network. In this work we propose to study how local learning can be applied at the level of splitting layers or modules into sub-components, adding a notion of width-wise modularity to the existing depth-wise modularity associated with local learning. We investigate local-learning penalties that permit such models to be trained efficiently. Our experiments on the CIFAR-10, CIFAR-100, and Imagenet32 datasets demonstrate that introducing width-level modularity can lead to computational advantages over existing methods based on local learning and opens new opportunities for improved model-parallel distributed training. Code is available at: https://github.com/adeetyapatel12/GN-DGL. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2301.07635v1-abstract-full').style.display = 'none'; document.getElementById('2301.07635v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 January, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2210.16156">arXiv:2210.16156</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2210.16156">pdf</a>, <a href="https://arxiv.org/format/2210.16156">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Reliability of CKA as a Similarity Measure in Deep Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Davari%2C+M">MohammadReza Davari</a>, <a href="/search/cs?searchtype=author&amp;query=Horoi%2C+S">Stefan Horoi</a>, <a href="/search/cs?searchtype=author&amp;query=Natik%2C+A">Amine Natik</a>, <a href="/search/cs?searchtype=author&amp;query=Lajoie%2C+G">Guillaume Lajoie</a>, <a href="/search/cs?searchtype=author&amp;query=Wolf%2C+G">Guy Wolf</a>, <a href="/search/cs?searchtype=author&amp;query=Belilovsky%2C+E">Eugene Belilovsky</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2210.16156v2-abstract-short" style="display: inline;"> Comparing learned neural representations in neural networks is a challenging but important problem, which has been approached in different ways. The Centered Kernel Alignment (CKA) similarity metric, particularly its linear variant, has recently become a popular approach and has been widely used to compare representations of a network&#39;s different layers, of architecturally similar networks trained&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.16156v2-abstract-full').style.display = 'inline'; document.getElementById('2210.16156v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2210.16156v2-abstract-full" style="display: none;"> Comparing learned neural representations in neural networks is a challenging but important problem, which has been approached in different ways. The Centered Kernel Alignment (CKA) similarity metric, particularly its linear variant, has recently become a popular approach and has been widely used to compare representations of a network&#39;s different layers, of architecturally similar networks trained differently, or of models with different architectures trained on the same data. A wide variety of conclusions about similarity and dissimilarity of these various representations have been made using CKA. In this work we present analysis that formally characterizes CKA sensitivity to a large class of simple transformations, which can naturally occur in the context of modern machine learning. This provides a concrete explanation of CKA sensitivity to outliers, which has been observed in past works, and to transformations that preserve the linear separability of the data, an important generalization attribute. We empirically investigate several weaknesses of the CKA similarity metric, demonstrating situations in which it gives unexpected or counter-intuitive results. Finally we study approaches for modifying representations to maintain functional behaviour while changing the CKA value. Our results illustrate that, in many cases, the CKA value can be easily manipulated without substantial changes to the functional behaviour of the models, and call for caution when leveraging activation alignment metrics. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.16156v2-abstract-full').style.display = 'none'; document.getElementById('2210.16156v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 November, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2203.13381">arXiv:2203.13381</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2203.13381">pdf</a>, <a href="https://arxiv.org/format/2203.13381">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Probing Representation Forgetting in Supervised and Unsupervised Continual Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Davari%2C+M">MohammadReza Davari</a>, <a href="/search/cs?searchtype=author&amp;query=Asadi%2C+N">Nader Asadi</a>, <a href="/search/cs?searchtype=author&amp;query=Mudur%2C+S">Sudhir Mudur</a>, <a href="/search/cs?searchtype=author&amp;query=Aljundi%2C+R">Rahaf Aljundi</a>, <a href="/search/cs?searchtype=author&amp;query=Belilovsky%2C+E">Eugene Belilovsky</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2203.13381v2-abstract-short" style="display: inline;"> Continual Learning research typically focuses on tackling the phenomenon of catastrophic forgetting in neural networks. Catastrophic forgetting is associated with an abrupt loss of knowledge previously learned by a model when the task, or more broadly the data distribution, being trained on changes. In supervised learning problems this forgetting, resulting from a change in the model&#39;s representat&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.13381v2-abstract-full').style.display = 'inline'; document.getElementById('2203.13381v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2203.13381v2-abstract-full" style="display: none;"> Continual Learning research typically focuses on tackling the phenomenon of catastrophic forgetting in neural networks. Catastrophic forgetting is associated with an abrupt loss of knowledge previously learned by a model when the task, or more broadly the data distribution, being trained on changes. In supervised learning problems this forgetting, resulting from a change in the model&#39;s representation, is typically measured or observed by evaluating the decrease in old task performance. However, a model&#39;s representation can change without losing knowledge about prior tasks. In this work we consider the concept of representation forgetting, observed by using the difference in performance of an optimal linear classifier before and after a new task is introduced. Using this tool we revisit a number of standard continual learning benchmarks and observe that, through this lens, model representations trained without any explicit control for forgetting often experience small representation forgetting and can sometimes be comparable to methods which explicitly control for forgetting, especially in longer task sequences. We also show that representation forgetting can lead to new insights on the effect of model capacity and loss function used in continual learning. Based on our results, we show that a simple yet competitive approach is to learn representations continually with standard supervised contrastive learning while constructing prototypes of class samples when queried on old samples. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.13381v2-abstract-full').style.display = 'none'; document.getElementById('2203.13381v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 April, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 March, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at CVPR 2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2203.13333">arXiv:2203.13333</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2203.13333">pdf</a>, <a href="https://arxiv.org/format/2203.13333">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3550469.3555392">10.1145/3550469.3555392 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> CLIP-Mesh: Generating textured meshes from text using pretrained image-text models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Khalid%2C+N+M">Nasir Mohammad Khalid</a>, <a href="/search/cs?searchtype=author&amp;query=Xie%2C+T">Tianhao Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Belilovsky%2C+E">Eugene Belilovsky</a>, <a href="/search/cs?searchtype=author&amp;query=Popa%2C+T">Tiberiu Popa</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2203.13333v2-abstract-short" style="display: inline;"> We present a technique for zero-shot generation of a 3D model using only a target text prompt. Without any 3D supervision our method deforms the control shape of a limit subdivided surface along with its texture map and normal map to obtain a 3D asset that corresponds to the input text prompt and can be easily deployed into games or modeling applications. We rely only on a pre-trained CLIP model t&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.13333v2-abstract-full').style.display = 'inline'; document.getElementById('2203.13333v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2203.13333v2-abstract-full" style="display: none;"> We present a technique for zero-shot generation of a 3D model using only a target text prompt. Without any 3D supervision our method deforms the control shape of a limit subdivided surface along with its texture map and normal map to obtain a 3D asset that corresponds to the input text prompt and can be easily deployed into games or modeling applications. We rely only on a pre-trained CLIP model that compares the input text prompt with differentiably rendered images of our 3D model. While previous works have focused on stylization or required training of generative models we perform optimization on mesh parameters directly to generate shape, texture or both. To constrain the optimization to produce plausible meshes and textures we introduce a number of techniques using image augmentations and the use of a pretrained prior that generates CLIP image embeddings given a text embedding. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.13333v2-abstract-full').style.display = 'none'; document.getElementById('2203.13333v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 September, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 March, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 8 figures, Accepted at SIGGRAPH ASIA 2022, Project Page at https://www.nasir.lol/clipmesh</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2203.13307">arXiv:2203.13307</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2203.13307">pdf</a>, <a href="https://arxiv.org/format/2203.13307">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Tackling Online One-Class Incremental Learning by Removing Negative Contrasts </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Asadi%2C+N">Nader Asadi</a>, <a href="/search/cs?searchtype=author&amp;query=Mudur%2C+S">Sudhir Mudur</a>, <a href="/search/cs?searchtype=author&amp;query=Belilovsky%2C+E">Eugene Belilovsky</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2203.13307v1-abstract-short" style="display: inline;"> Recent work studies the supervised online continual learning setting where a learner receives a stream of data whose class distribution changes over time. Distinct from other continual learning settings the learner is presented new samples only once and must distinguish between all seen classes. A number of successful methods in this setting focus on storing and replaying a subset of samples along&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.13307v1-abstract-full').style.display = 'inline'; document.getElementById('2203.13307v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2203.13307v1-abstract-full" style="display: none;"> Recent work studies the supervised online continual learning setting where a learner receives a stream of data whose class distribution changes over time. Distinct from other continual learning settings the learner is presented new samples only once and must distinguish between all seen classes. A number of successful methods in this setting focus on storing and replaying a subset of samples alongside incoming data in a computationally efficient manner. One recent proposal ER-AML achieved strong performance in this setting by applying an asymmetric loss based on contrastive learning to the incoming data and replayed data. However, a key ingredient of the proposed method is avoiding contrasts between incoming data and stored data, which makes it impractical for the setting where only one new class is introduced in each phase of the stream. In this work we adapt a recently proposed approach (\textit{BYOL}) from self-supervised learning to the supervised learning setting, unlocking the constraint on contrasts. We then show that supplementing this with additional regularization on class prototypes yields a new method that achieves strong performance in the one-class incremental learning setting and is competitive with the top performing methods in the multi-class incremental setting. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.13307v1-abstract-full').style.display = 'none'; document.getElementById('2203.13307v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 March, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at NeurIPS 2021 Workshop on Distribution Shifts</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2203.03798">arXiv:2203.03798</a> <span>&nbsp;&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> New Insights on Reducing Abrupt Representation Change in Online Continual Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Caccia%2C+L">Lucas Caccia</a>, <a href="/search/cs?searchtype=author&amp;query=Aljundi%2C+R">Rahaf Aljundi</a>, <a href="/search/cs?searchtype=author&amp;query=Asadi%2C+N">Nader Asadi</a>, <a href="/search/cs?searchtype=author&amp;query=Tuytelaars%2C+T">Tinne Tuytelaars</a>, <a href="/search/cs?searchtype=author&amp;query=Pineau%2C+J">Joelle Pineau</a>, <a href="/search/cs?searchtype=author&amp;query=Belilovsky%2C+E">Eugene Belilovsky</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2203.03798v3-abstract-short" style="display: inline;"> In the online continual learning paradigm, agents must learn from a changing distribution while respecting memory and compute constraints. Experience Replay (ER), where a small subset of past data is stored and replayed alongside new data, has emerged as a simple and effective learning strategy. In this work, we focus on the change in representations of observed data that arises when previously un&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.03798v3-abstract-full').style.display = 'inline'; document.getElementById('2203.03798v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2203.03798v3-abstract-full" style="display: none;"> In the online continual learning paradigm, agents must learn from a changing distribution while respecting memory and compute constraints. Experience Replay (ER), where a small subset of past data is stored and replayed alongside new data, has emerged as a simple and effective learning strategy. In this work, we focus on the change in representations of observed data that arises when previously unobserved classes appear in the incoming data stream, and new classes must be distinguished from previous ones. We shed new light on this question by showing that applying ER causes the newly added classes&#39; representations to overlap significantly with the previous classes, leading to highly disruptive parameter updates. Based on this empirical analysis, we propose a new method which mitigates this issue by shielding the learned representations from drastic adaptation to accommodate new classes. We show that using an asymmetric update rule pushes new classes to adapt to the older ones (rather than the reverse), which is more effective especially at task boundaries, where much of the forgetting typically occurs. Empirical results show significant gains over strong baselines on standard continual learning benchmarks <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.03798v3-abstract-full').style.display = 'none'; document.getElementById('2203.03798v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 April, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 March, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This has been withdrawn as it is a new version of arXiv:2104.05025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2201.13415">arXiv:2201.13415</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2201.13415">pdf</a>, <a href="https://arxiv.org/format/2201.13415">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> </div> </div> <p class="title is-5 mathjax"> Towards Scaling Difference Target Propagation by Learning Backprop Targets </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ernoult%2C+M">Maxence Ernoult</a>, <a href="/search/cs?searchtype=author&amp;query=Normandin%2C+F">Fabrice Normandin</a>, <a href="/search/cs?searchtype=author&amp;query=Moudgil%2C+A">Abhinav Moudgil</a>, <a href="/search/cs?searchtype=author&amp;query=Spinney%2C+S">Sean Spinney</a>, <a href="/search/cs?searchtype=author&amp;query=Belilovsky%2C+E">Eugene Belilovsky</a>, <a href="/search/cs?searchtype=author&amp;query=Rish%2C+I">Irina Rish</a>, <a href="/search/cs?searchtype=author&amp;query=Richards%2C+B">Blake Richards</a>, <a href="/search/cs?searchtype=author&amp;query=Bengio%2C+Y">Yoshua Bengio</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2201.13415v1-abstract-short" style="display: inline;"> The development of biologically-plausible learning algorithms is important for understanding learning in the brain, but most of them fail to scale-up to real-world tasks, limiting their potential as explanations for learning by real brains. As such, it is important to explore learning algorithms that come with strong theoretical guarantees and can match the performance of backpropagation (BP) on c&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2201.13415v1-abstract-full').style.display = 'inline'; document.getElementById('2201.13415v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2201.13415v1-abstract-full" style="display: none;"> The development of biologically-plausible learning algorithms is important for understanding learning in the brain, but most of them fail to scale-up to real-world tasks, limiting their potential as explanations for learning by real brains. As such, it is important to explore learning algorithms that come with strong theoretical guarantees and can match the performance of backpropagation (BP) on complex tasks. One such algorithm is Difference Target Propagation (DTP), a biologically-plausible learning algorithm whose close relation with Gauss-Newton (GN) optimization has been recently established. However, the conditions under which this connection rigorously holds preclude layer-wise training of the feedback pathway synaptic weights (which is more biologically plausible). Moreover, good alignment between DTP weight updates and loss gradients is only loosely guaranteed and under very specific conditions for the architecture being trained. In this paper, we propose a novel feedback weight training scheme that ensures both that DTP approximates BP and that layer-wise feedback weight training can be restored without sacrificing any theoretical guarantees. Our theory is corroborated by experimental results and we report the best performance ever achieved by DTP on CIFAR-10 and ImageNet 32$\times$32 <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2201.13415v1-abstract-full').style.display = 'none'; document.getElementById('2201.13415v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 January, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2201.11986">arXiv:2201.11986</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2201.11986">pdf</a>, <a href="https://arxiv.org/format/2201.11986">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Gradient Masked Averaging for Federated Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Tenison%2C+I">Irene Tenison</a>, <a href="/search/cs?searchtype=author&amp;query=Sreeramadas%2C+S+A">Sai Aravind Sreeramadas</a>, <a href="/search/cs?searchtype=author&amp;query=Mugunthan%2C+V">Vaikkunth Mugunthan</a>, <a href="/search/cs?searchtype=author&amp;query=Oyallon%2C+E">Edouard Oyallon</a>, <a href="/search/cs?searchtype=author&amp;query=Rish%2C+I">Irina Rish</a>, <a href="/search/cs?searchtype=author&amp;query=Belilovsky%2C+E">Eugene Belilovsky</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2201.11986v2-abstract-short" style="display: inline;"> Federated learning (FL) is an emerging paradigm that permits a large number of clients with heterogeneous data to coordinate learning of a unified global model without the need to share data amongst each other. A major challenge in federated learning is the heterogeneity of data across client, which can degrade the performance of standard FL algorithms. Standard FL algorithms involve averaging of&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2201.11986v2-abstract-full').style.display = 'inline'; document.getElementById('2201.11986v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2201.11986v2-abstract-full" style="display: none;"> Federated learning (FL) is an emerging paradigm that permits a large number of clients with heterogeneous data to coordinate learning of a unified global model without the need to share data amongst each other. A major challenge in federated learning is the heterogeneity of data across client, which can degrade the performance of standard FL algorithms. Standard FL algorithms involve averaging of model parameters or gradient updates to approximate the global model at the server. However, we argue that in heterogeneous settings, averaging can result in information loss and lead to poor generalization due to the bias induced by dominant client gradients. We hypothesize that to generalize better across non-i.i.d datasets, the algorithms should focus on learning the invariant mechanism that is constant while ignoring spurious mechanisms that differ across clients. Inspired from recent works in Out-of-Distribution generalization, we propose a gradient masked averaging approach for FL as an alternative to the standard averaging of client updates. This aggregation technique for client updates can be adapted as a drop-in replacement in most existing federated algorithms. We perform extensive experiments on multiple FL algorithms with in-distribution, real-world, feature-skewed out-of-distribution, and quantity imbalanced datasets and show that it provides consistent improvements, particularly in the case of heterogeneous clients. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2201.11986v2-abstract-full').style.display = 'none'; document.getElementById('2201.11986v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 January, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2107.09539">arXiv:2107.09539</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2107.09539">pdf</a>, <a href="https://arxiv.org/format/2107.09539">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Parametric Scattering Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gauthier%2C+S">Shanel Gauthier</a>, <a href="/search/cs?searchtype=author&amp;query=Th%C3%A9rien%2C+B">Benjamin Th茅rien</a>, <a href="/search/cs?searchtype=author&amp;query=Als%C3%A8ne-Racicot%2C+L">Laurent Als猫ne-Racicot</a>, <a href="/search/cs?searchtype=author&amp;query=Chaudhary%2C+M">Muawiz Chaudhary</a>, <a href="/search/cs?searchtype=author&amp;query=Rish%2C+I">Irina Rish</a>, <a href="/search/cs?searchtype=author&amp;query=Belilovsky%2C+E">Eugene Belilovsky</a>, <a href="/search/cs?searchtype=author&amp;query=Eickenberg%2C+M">Michael Eickenberg</a>, <a href="/search/cs?searchtype=author&amp;query=Wolf%2C+G">Guy Wolf</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2107.09539v4-abstract-short" style="display: inline;"> The wavelet scattering transform creates geometric invariants and deformation stability. In multiple signal domains, it has been shown to yield more discriminative representations compared to other non-learned representations and to outperform learned representations in certain tasks, particularly on limited labeled data and highly structured signals. The wavelet filters used in the scattering tra&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2107.09539v4-abstract-full').style.display = 'inline'; document.getElementById('2107.09539v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2107.09539v4-abstract-full" style="display: none;"> The wavelet scattering transform creates geometric invariants and deformation stability. In multiple signal domains, it has been shown to yield more discriminative representations compared to other non-learned representations and to outperform learned representations in certain tasks, particularly on limited labeled data and highly structured signals. The wavelet filters used in the scattering transform are typically selected to create a tight frame via a parameterized mother wavelet. In this work, we investigate whether this standard wavelet filterbank construction is optimal. Focusing on Morlet wavelets, we propose to learn the scales, orientations, and aspect ratios of the filters to produce problem-specific parameterizations of the scattering transform. We show that our learned versions of the scattering transform yield significant performance gains in small-sample classification settings over the standard scattering transform. Moreover, our empirical results suggest that traditional filterbank constructions may not always be necessary for scattering transforms to extract effective representations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2107.09539v4-abstract-full').style.display = 'none'; document.getElementById('2107.09539v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 August, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 July, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> F.2.2; I.2.7 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2106.06440">arXiv:2106.06440</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2106.06440">pdf</a>, <a href="https://arxiv.org/format/2106.06440">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Learning Compositional Shape Priors for Few-Shot 3D Reconstruction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Michalkiewicz%2C+M">Mateusz Michalkiewicz</a>, <a href="/search/cs?searchtype=author&amp;query=Tsogkas%2C+S">Stavros Tsogkas</a>, <a href="/search/cs?searchtype=author&amp;query=Parisot%2C+S">Sarah Parisot</a>, <a href="/search/cs?searchtype=author&amp;query=Baktashmotlagh%2C+M">Mahsa Baktashmotlagh</a>, <a href="/search/cs?searchtype=author&amp;query=Eriksson%2C+A">Anders Eriksson</a>, <a href="/search/cs?searchtype=author&amp;query=Belilovsky%2C+E">Eugene Belilovsky</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2106.06440v2-abstract-short" style="display: inline;"> The impressive performance of deep convolutional neural networks in single-view 3D reconstruction suggests that these models perform non-trivial reasoning about the 3D structure of the output space. Recent work has challenged this belief, showing that, on standard benchmarks, complex encoder-decoder architectures perform similarly to nearest-neighbor baselines or simple linear decoder models that&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2106.06440v2-abstract-full').style.display = 'inline'; document.getElementById('2106.06440v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2106.06440v2-abstract-full" style="display: none;"> The impressive performance of deep convolutional neural networks in single-view 3D reconstruction suggests that these models perform non-trivial reasoning about the 3D structure of the output space. Recent work has challenged this belief, showing that, on standard benchmarks, complex encoder-decoder architectures perform similarly to nearest-neighbor baselines or simple linear decoder models that exploit large amounts of per-category data. However, building large collections of 3D shapes for supervised training is a laborious process; a more realistic and less constraining task is inferring 3D shapes for categories with few available training examples, calling for a model that can successfully generalize to novel object classes. In this work we experimentally demonstrate that naive baselines fail in this few-shot learning setting, in which the network must learn informative shape priors for inference of new categories. We propose three ways to learn a class-specific global shape prior, directly from data. Using these techniques, we are able to capture multi-scale information about the 3D shape, and account for intra-class variability by virtue of an implicit compositional structure. Experiments on the popular ShapeNet dataset show that our method outperforms a zero-shot baseline by over 40%, and the current state-of-the-art by over 10%, in terms of relative performance, in the few-shot setting. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2106.06440v2-abstract-full').style.display = 'none'; document.getElementById('2106.06440v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 June, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 11 June, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages, 12 figures. arXiv admin note: substantial text overlap with arXiv:2004.06302</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2106.06401">arXiv:2106.06401</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2106.06401">pdf</a>, <a href="https://arxiv.org/format/2106.06401">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> Decoupled Greedy Learning of CNNs for Synchronous and Asynchronous Distributed Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Belilovsky%2C+E">Eugene Belilovsky</a>, <a href="/search/cs?searchtype=author&amp;query=Leconte%2C+L">Louis Leconte</a>, <a href="/search/cs?searchtype=author&amp;query=Caccia%2C+L">Lucas Caccia</a>, <a href="/search/cs?searchtype=author&amp;query=Eickenberg%2C+M">Michael Eickenberg</a>, <a href="/search/cs?searchtype=author&amp;query=Oyallon%2C+E">Edouard Oyallon</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2106.06401v1-abstract-short" style="display: inline;"> A commonly cited inefficiency of neural network training using back-propagation is the update locking problem: each layer must wait for the signal to propagate through the full network before updating. Several alternatives that can alleviate this issue have been proposed. In this context, we consider a simple alternative based on minimal feedback, which we call Decoupled Greedy Learning (DGL). It&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2106.06401v1-abstract-full').style.display = 'inline'; document.getElementById('2106.06401v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2106.06401v1-abstract-full" style="display: none;"> A commonly cited inefficiency of neural network training using back-propagation is the update locking problem: each layer must wait for the signal to propagate through the full network before updating. Several alternatives that can alleviate this issue have been proposed. In this context, we consider a simple alternative based on minimal feedback, which we call Decoupled Greedy Learning (DGL). It is based on a classic greedy relaxation of the joint training objective, recently shown to be effective in the context of Convolutional Neural Networks (CNNs) on large-scale image classification. We consider an optimization of this objective that permits us to decouple the layer training, allowing for layers or modules in networks to be trained with a potentially linear parallelization. With the use of a replay buffer we show that this approach can be extended to asynchronous settings, where modules can operate and continue to update with possibly large communication delays. To address bandwidth and memory issues we propose an approach based on online vector quantization. This allows to drastically reduce the communication bandwidth between modules and required memory for replay buffers. We show theoretically and empirically that this approach converges and compare it to the sequential solvers. We demonstrate the effectiveness of DGL against alternative approaches on the CIFAR-10 dataset and on the large-scale ImageNet dataset. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2106.06401v1-abstract-full').style.display = 'none'; document.getElementById('2106.06401v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 June, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">arXiv admin note: substantial text overlap with arXiv:1901.08164</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2104.05025">arXiv:2104.05025</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2104.05025">pdf</a>, <a href="https://arxiv.org/format/2104.05025">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> New Insights on Reducing Abrupt Representation Change in Online Continual Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Caccia%2C+L">Lucas Caccia</a>, <a href="/search/cs?searchtype=author&amp;query=Aljundi%2C+R">Rahaf Aljundi</a>, <a href="/search/cs?searchtype=author&amp;query=Asadi%2C+N">Nader Asadi</a>, <a href="/search/cs?searchtype=author&amp;query=Tuytelaars%2C+T">Tinne Tuytelaars</a>, <a href="/search/cs?searchtype=author&amp;query=Pineau%2C+J">Joelle Pineau</a>, <a href="/search/cs?searchtype=author&amp;query=Belilovsky%2C+E">Eugene Belilovsky</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2104.05025v3-abstract-short" style="display: inline;"> In the online continual learning paradigm, agents must learn from a changing distribution while respecting memory and compute constraints. Experience Replay (ER), where a small subset of past data is stored and replayed alongside new data, has emerged as a simple and effective learning strategy. In this work, we focus on the change in representations of observed data that arises when previously un&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2104.05025v3-abstract-full').style.display = 'inline'; document.getElementById('2104.05025v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2104.05025v3-abstract-full" style="display: none;"> In the online continual learning paradigm, agents must learn from a changing distribution while respecting memory and compute constraints. Experience Replay (ER), where a small subset of past data is stored and replayed alongside new data, has emerged as a simple and effective learning strategy. In this work, we focus on the change in representations of observed data that arises when previously unobserved classes appear in the incoming data stream, and new classes must be distinguished from previous ones. We shed new light on this question by showing that applying ER causes the newly added classes&#39; representations to overlap significantly with the previous classes, leading to highly disruptive parameter updates. Based on this empirical analysis, we propose a new method which mitigates this issue by shielding the learned representations from drastic adaptation to accommodate new classes. We show that using an asymmetric update rule pushes new classes to adapt to the older ones (rather than the reverse), which is more effective especially at task boundaries, where much of the forgetting typically occurs. Empirical results show significant gains over strong baselines on standard continual learning benchmarks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2104.05025v3-abstract-full').style.display = 'none'; document.getElementById('2104.05025v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 May, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 11 April, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at ICLR 2022. Code available at www.github.com/pclucas14/AML</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2101.07528">arXiv:2101.07528</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2101.07528">pdf</a>, <a href="https://arxiv.org/format/2101.07528">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> The Unreasonable Effectiveness of Patches in Deep Convolutional Kernels Methods </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Thiry%2C+L">Louis Thiry</a>, <a href="/search/cs?searchtype=author&amp;query=Arbel%2C+M">Michael Arbel</a>, <a href="/search/cs?searchtype=author&amp;query=Belilovsky%2C+E">Eugene Belilovsky</a>, <a href="/search/cs?searchtype=author&amp;query=Oyallon%2C+E">Edouard Oyallon</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2101.07528v1-abstract-short" style="display: inline;"> A recent line of work showed that various forms of convolutional kernel methods can be competitive with standard supervised deep convolutional networks on datasets like CIFAR-10, obtaining accuracies in the range of 87-90% while being more amenable to theoretical analysis. In this work, we highlight the importance of a data-dependent feature extraction step that is key to the obtain good performan&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2101.07528v1-abstract-full').style.display = 'inline'; document.getElementById('2101.07528v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2101.07528v1-abstract-full" style="display: none;"> A recent line of work showed that various forms of convolutional kernel methods can be competitive with standard supervised deep convolutional networks on datasets like CIFAR-10, obtaining accuracies in the range of 87-90% while being more amenable to theoretical analysis. In this work, we highlight the importance of a data-dependent feature extraction step that is key to the obtain good performance in convolutional kernel methods. This step typically corresponds to a whitened dictionary of patches, and gives rise to a data-driven convolutional kernel methods. We extensively study its effect, demonstrating it is the key ingredient for high performance of these methods. Specifically, we show that one of the simplest instances of such kernel methods, based on a single layer of image patches followed by a linear classifier is already obtaining classification accuracies on CIFAR-10 in the same range as previous more sophisticated convolutional kernel methods. We scale this method to the challenging ImageNet dataset, showing such a simple approach can exceed all existing non-learned representation methods. This is a new baseline for object recognition without representation learning methods, that initiates the investigation of convolutional kernel models on ImageNet. We conduct experiments to analyze the dictionary that we used, our ablations showing they exhibit low-dimensional properties. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2101.07528v1-abstract-full').style.display = 'none'; document.getElementById('2101.07528v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 January, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> International Conference on Learning Representation (ICLR 2021), 2021, Vienna (online), Austria </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2007.05756">arXiv:2007.05756</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2007.05756">pdf</a>, <a href="https://arxiv.org/format/2007.05756">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Generative Compositional Augmentations for Scene Graph Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Knyazev%2C+B">Boris Knyazev</a>, <a href="/search/cs?searchtype=author&amp;query=de+Vries%2C+H">Harm de Vries</a>, <a href="/search/cs?searchtype=author&amp;query=Cangea%2C+C">C膬t膬lina Cangea</a>, <a href="/search/cs?searchtype=author&amp;query=Taylor%2C+G+W">Graham W. Taylor</a>, <a href="/search/cs?searchtype=author&amp;query=Courville%2C+A">Aaron Courville</a>, <a href="/search/cs?searchtype=author&amp;query=Belilovsky%2C+E">Eugene Belilovsky</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2007.05756v3-abstract-short" style="display: inline;"> Inferring objects and their relationships from an image in the form of a scene graph is useful in many applications at the intersection of vision and language. We consider a challenging problem of compositional generalization that emerges in this task due to a long tail data distribution. Current scene graph generation models are trained on a tiny fraction of the distribution corresponding to the&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2007.05756v3-abstract-full').style.display = 'inline'; document.getElementById('2007.05756v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2007.05756v3-abstract-full" style="display: none;"> Inferring objects and their relationships from an image in the form of a scene graph is useful in many applications at the intersection of vision and language. We consider a challenging problem of compositional generalization that emerges in this task due to a long tail data distribution. Current scene graph generation models are trained on a tiny fraction of the distribution corresponding to the most frequent compositions, e.g. &lt;cup, on, table&gt;. However, test images might contain zero- and few-shot compositions of objects and relationships, e.g. &lt;cup, on, surfboard&gt;. Despite each of the object categories and the predicate (e.g. &#39;on&#39;) being frequent in the training data, the models often fail to properly understand such unseen or rare compositions. To improve generalization, it is natural to attempt increasing the diversity of the training distribution. However, in the graph domain this is non-trivial. To that end, we propose a method to synthesize rare yet plausible scene graphs by perturbing real ones. We then propose and empirically study a model based on conditional generative adversarial networks (GANs) that allows us to generate visual features of perturbed scene graphs and learn from them in a joint fashion. When evaluated on the Visual Genome dataset, our approach yields marginal, but consistent improvements in zero- and few-shot metrics. We analyze the limitations of our approach indicating promising directions for future research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2007.05756v3-abstract-full').style.display = 'none'; document.getElementById('2007.05756v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 October, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 11 July, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICCV 2021 camera ready. Added more baselines, combining GANs with Neural Motifs and t-sne visualizations. Code is available at https://github.com/bknyaz/sgg</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2005.08230">arXiv:2005.08230</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2005.08230">pdf</a>, <a href="https://arxiv.org/format/2005.08230">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Graph Density-Aware Losses for Novel Compositions in Scene Graph Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Knyazev%2C+B">Boris Knyazev</a>, <a href="/search/cs?searchtype=author&amp;query=de+Vries%2C+H">Harm de Vries</a>, <a href="/search/cs?searchtype=author&amp;query=Cangea%2C+C">C膬t膬lina Cangea</a>, <a href="/search/cs?searchtype=author&amp;query=Taylor%2C+G+W">Graham W. Taylor</a>, <a href="/search/cs?searchtype=author&amp;query=Courville%2C+A">Aaron Courville</a>, <a href="/search/cs?searchtype=author&amp;query=Belilovsky%2C+E">Eugene Belilovsky</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2005.08230v2-abstract-short" style="display: inline;"> Scene graph generation (SGG) aims to predict graph-structured descriptions of input images, in the form of objects and relationships between them. This task is becoming increasingly useful for progress at the interface of vision and language. Here, it is important - yet challenging - to perform well on novel (zero-shot) or rare (few-shot) compositions of objects and relationships. In this paper, w&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2005.08230v2-abstract-full').style.display = 'inline'; document.getElementById('2005.08230v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2005.08230v2-abstract-full" style="display: none;"> Scene graph generation (SGG) aims to predict graph-structured descriptions of input images, in the form of objects and relationships between them. This task is becoming increasingly useful for progress at the interface of vision and language. Here, it is important - yet challenging - to perform well on novel (zero-shot) or rare (few-shot) compositions of objects and relationships. In this paper, we identify two key issues that limit such generalization. Firstly, we show that the standard loss used in this task is unintentionally a function of scene graph density. This leads to the neglect of individual edges in large sparse graphs during training, even though these contain diverse few-shot examples that are important for generalization. Secondly, the frequency of relationships can create a strong bias in this task, such that a blind model predicting the most frequent relationship achieves good performance. Consequently, some state-of-the-art models exploit this bias to improve results. We show that such models can suffer the most in their ability to generalize to rare compositions, evaluating two different models on the Visual Genome dataset and its more recent, improved version, GQA. To address these issues, we introduce a density-normalized edge loss, which provides more than a two-fold improvement in certain generalization metrics. Compared to other works in this direction, our enhancements require only a few lines of code and no added computational cost. We also highlight the difficulty of accurately evaluating models using existing metrics, especially on zero/few shots, and introduce a novel weighted metric. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2005.08230v2-abstract-full').style.display = 'none'; document.getElementById('2005.08230v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 August, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 May, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accepted at BMVC 2020, the code is available at https://github.com/bknyaz/sgg</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2005.04623">arXiv:2005.04623</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2005.04623">pdf</a>, <a href="https://arxiv.org/format/2005.04623">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> A Simple and Scalable Shape Representation for 3D Reconstruction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Michalkiewicz%2C+M">Mateusz Michalkiewicz</a>, <a href="/search/cs?searchtype=author&amp;query=Belilovsky%2C+E">Eugene Belilovsky</a>, <a href="/search/cs?searchtype=author&amp;query=Baktashmotlagh%2C+M">Mahsa Baktashmotlagh</a>, <a href="/search/cs?searchtype=author&amp;query=Eriksson%2C+A">Anders Eriksson</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2005.04623v1-abstract-short" style="display: inline;"> Deep learning applied to the reconstruction of 3D shapes has seen growing interest. A popular approach to 3D reconstruction and generation in recent years has been the CNN encoder-decoder model usually applied in voxel space. However, this often scales very poorly with the resolution limiting the effectiveness of these models. Several sophisticated alternatives for decoding to 3D shapes have been&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2005.04623v1-abstract-full').style.display = 'inline'; document.getElementById('2005.04623v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2005.04623v1-abstract-full" style="display: none;"> Deep learning applied to the reconstruction of 3D shapes has seen growing interest. A popular approach to 3D reconstruction and generation in recent years has been the CNN encoder-decoder model usually applied in voxel space. However, this often scales very poorly with the resolution limiting the effectiveness of these models. Several sophisticated alternatives for decoding to 3D shapes have been proposed typically relying on complex deep learning architectures for the decoder model. In this work, we show that this additional complexity is not necessary, and that we can actually obtain high quality 3D reconstruction using a linear decoder, obtained from principal component analysis on the signed distance function (SDF) of the surface. This approach allows easily scaling to larger resolutions. We show in multiple experiments that our approach is competitive with state-of-the-art methods. It also allows the decoder to be fine-tuned on the target task using a loss designed specifically for SDF transforms, obtaining further gains. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2005.04623v1-abstract-full').style.display = 'none'; document.getElementById('2005.04623v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 May, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages plus 3 pages of references. 4 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 65D19 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2004.06302">arXiv:2004.06302</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2004.06302">pdf</a>, <a href="https://arxiv.org/format/2004.06302">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Few-Shot Single-View 3-D Object Reconstruction with Compositional Priors </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Michalkiewicz%2C+M">Mateusz Michalkiewicz</a>, <a href="/search/cs?searchtype=author&amp;query=Parisot%2C+S">Sarah Parisot</a>, <a href="/search/cs?searchtype=author&amp;query=Tsogkas%2C+S">Stavros Tsogkas</a>, <a href="/search/cs?searchtype=author&amp;query=Baktashmotlagh%2C+M">Mahsa Baktashmotlagh</a>, <a href="/search/cs?searchtype=author&amp;query=Eriksson%2C+A">Anders Eriksson</a>, <a href="/search/cs?searchtype=author&amp;query=Belilovsky%2C+E">Eugene Belilovsky</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2004.06302v2-abstract-short" style="display: inline;"> The impressive performance of deep convolutional neural networks in single-view 3D reconstruction suggests that these models perform non-trivial reasoning about the 3D structure of the output space. However, recent work has challenged this belief, showing that complex encoder-decoder architectures perform similarly to nearest-neighbor baselines or simple linear decoder models that exploit large am&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2004.06302v2-abstract-full').style.display = 'inline'; document.getElementById('2004.06302v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2004.06302v2-abstract-full" style="display: none;"> The impressive performance of deep convolutional neural networks in single-view 3D reconstruction suggests that these models perform non-trivial reasoning about the 3D structure of the output space. However, recent work has challenged this belief, showing that complex encoder-decoder architectures perform similarly to nearest-neighbor baselines or simple linear decoder models that exploit large amounts of per category data in standard benchmarks. On the other hand settings where 3D shape must be inferred for new categories with few examples are more natural and require models that generalize about shapes. In this work we demonstrate experimentally that naive baselines do not apply when the goal is to learn to reconstruct novel objects using very few examples, and that in a \emph{few-shot} learning setting, the network must learn concepts that can be applied to new categories, avoiding rote memorization. To address deficiencies in existing approaches to this problem, we propose three approaches that efficiently integrate a class prior into a 3D reconstruction model, allowing to account for intra-class variability and imposing an implicit compositional structure that the model should learn. Experiments on the popular ShapeNet database demonstrate that our method significantly outperform existing baselines on this task in the few-shot setting. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2004.06302v2-abstract-full').style.display = 'none'; document.getElementById('2004.06302v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 May, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 April, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2020. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1911.08019">arXiv:1911.08019</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1911.08019">pdf</a>, <a href="https://arxiv.org/format/1911.08019">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Online Learned Continual Compression with Adaptive Quantization Modules </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Caccia%2C+L">Lucas Caccia</a>, <a href="/search/cs?searchtype=author&amp;query=Belilovsky%2C+E">Eugene Belilovsky</a>, <a href="/search/cs?searchtype=author&amp;query=Caccia%2C+M">Massimo Caccia</a>, <a href="/search/cs?searchtype=author&amp;query=Pineau%2C+J">Joelle Pineau</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1911.08019v3-abstract-short" style="display: inline;"> We introduce and study the problem of Online Continual Compression, where one attempts to simultaneously learn to compress and store a representative dataset from a non i.i.d data stream, while only observing each sample once. A naive application of auto-encoders in this setting encounters a major challenge: representations derived from earlier encoder states must be usable by later decoder states&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1911.08019v3-abstract-full').style.display = 'inline'; document.getElementById('1911.08019v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1911.08019v3-abstract-full" style="display: none;"> We introduce and study the problem of Online Continual Compression, where one attempts to simultaneously learn to compress and store a representative dataset from a non i.i.d data stream, while only observing each sample once. A naive application of auto-encoders in this setting encounters a major challenge: representations derived from earlier encoder states must be usable by later decoder states. We show how to use discrete auto-encoders to effectively address this challenge and introduce Adaptive Quantization Modules (AQM) to control variation in the compression ability of the module at any given stage of learning. This enables selecting an appropriate compression for incoming samples, while taking into account overall memory constraints and current progress of the learned compression. Unlike previous methods, our approach does not require any pretraining, even on challenging datasets. We show that using AQM to replace standard episodic memory in continual learning settings leads to significant gains on continual learning benchmarks. Furthermore we demonstrate this approach with larger images, LiDAR, and reinforcement learning environments. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1911.08019v3-abstract-full').style.display = 'none'; document.getElementById('1911.08019v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 August, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 November, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2019. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1908.04950">arXiv:1908.04950</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1908.04950">pdf</a>, <a href="https://arxiv.org/format/1908.04950">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> VideoNavQA: Bridging the Gap between Visual and Embodied Question Answering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Cangea%2C+C">C膬t膬lina Cangea</a>, <a href="/search/cs?searchtype=author&amp;query=Belilovsky%2C+E">Eugene Belilovsky</a>, <a href="/search/cs?searchtype=author&amp;query=Li%C3%B2%2C+P">Pietro Li貌</a>, <a href="/search/cs?searchtype=author&amp;query=Courville%2C+A">Aaron Courville</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1908.04950v1-abstract-short" style="display: inline;"> Embodied Question Answering (EQA) is a recently proposed task, where an agent is placed in a rich 3D environment and must act based solely on its egocentric input to answer a given question. The desired outcome is that the agent learns to combine capabilities such as scene understanding, navigation and language understanding in order to perform complex reasoning in the visual world. However, initi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1908.04950v1-abstract-full').style.display = 'inline'; document.getElementById('1908.04950v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1908.04950v1-abstract-full" style="display: none;"> Embodied Question Answering (EQA) is a recently proposed task, where an agent is placed in a rich 3D environment and must act based solely on its egocentric input to answer a given question. The desired outcome is that the agent learns to combine capabilities such as scene understanding, navigation and language understanding in order to perform complex reasoning in the visual world. However, initial advancements combining standard vision and language methods with imitation and reinforcement learning algorithms have shown EQA might be too complex and challenging for these techniques. In order to investigate the feasibility of EQA-type tasks, we build the VideoNavQA dataset that contains pairs of questions and videos generated in the House3D environment. The goal of this dataset is to assess question-answering performance from nearly-ideal navigation paths, while considering a much more complete variety of questions than current instantiations of the EQA task. We investigate several models, adapted from popular VQA methods, on this new benchmark. This establishes an initial understanding of how well VQA-style methods can perform within this novel EQA paradigm. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1908.04950v1-abstract-full').style.display = 'none'; document.getElementById('1908.04950v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 August, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2019. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To appear at BMVC 2019. 15 pages, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1908.04742">arXiv:1908.04742</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1908.04742">pdf</a>, <a href="https://arxiv.org/format/1908.04742">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Online Continual Learning with Maximally Interfered Retrieval </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Aljundi%2C+R">Rahaf Aljundi</a>, <a href="/search/cs?searchtype=author&amp;query=Caccia%2C+L">Lucas Caccia</a>, <a href="/search/cs?searchtype=author&amp;query=Belilovsky%2C+E">Eugene Belilovsky</a>, <a href="/search/cs?searchtype=author&amp;query=Caccia%2C+M">Massimo Caccia</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+M">Min Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Charlin%2C+L">Laurent Charlin</a>, <a href="/search/cs?searchtype=author&amp;query=Tuytelaars%2C+T">Tinne Tuytelaars</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1908.04742v3-abstract-short" style="display: inline;"> Continual learning, the setting where a learning agent is faced with a never ending stream of data, continues to be a great challenge for modern machine learning systems. In particular the online or &#34;single-pass through the data&#34; setting has gained attention recently as a natural setting that is difficult to tackle. Methods based on replay, either generative or from a stored memory, have been show&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1908.04742v3-abstract-full').style.display = 'inline'; document.getElementById('1908.04742v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1908.04742v3-abstract-full" style="display: none;"> Continual learning, the setting where a learning agent is faced with a never ending stream of data, continues to be a great challenge for modern machine learning systems. In particular the online or &#34;single-pass through the data&#34; setting has gained attention recently as a natural setting that is difficult to tackle. Methods based on replay, either generative or from a stored memory, have been shown to be effective approaches for continual learning, matching or exceeding the state of the art in a number of standard benchmarks. These approaches typically rely on randomly selecting samples from the replay memory or from a generative model, which is suboptimal. In this work, we consider a controlled sampling of memories for replay. We retrieve the samples which are most interfered, i.e. whose prediction will be most negatively impacted by the foreseen parameters update. We show a formulation for this sampling criterion in both the generative replay and the experience replay setting, producing consistent gains in performance and greatly reduced forgetting. We release an implementation of our method at https://github.com/optimass/Maximally_Interfered_Retrieval. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1908.04742v3-abstract-full').style.display = 'none'; document.getElementById('1908.04742v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 October, 2019; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 11 August, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2019. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> NeurIPS 2019 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1901.08164">arXiv:1901.08164</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1901.08164">pdf</a>, <a href="https://arxiv.org/format/1901.08164">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Decoupled Greedy Learning of CNNs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Belilovsky%2C+E">Eugene Belilovsky</a>, <a href="/search/cs?searchtype=author&amp;query=Eickenberg%2C+M">Michael Eickenberg</a>, <a href="/search/cs?searchtype=author&amp;query=Oyallon%2C+E">Edouard Oyallon</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1901.08164v4-abstract-short" style="display: inline;"> A commonly cited inefficiency of neural network training by back-propagation is the update locking problem: each layer must wait for the signal to propagate through the full network before updating. Several alternatives that can alleviate this issue have been proposed. In this context, we consider a simpler, but more effective, substitute that uses minimal feedback, which we call Decoupled Greedy&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1901.08164v4-abstract-full').style.display = 'inline'; document.getElementById('1901.08164v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1901.08164v4-abstract-full" style="display: none;"> A commonly cited inefficiency of neural network training by back-propagation is the update locking problem: each layer must wait for the signal to propagate through the full network before updating. Several alternatives that can alleviate this issue have been proposed. In this context, we consider a simpler, but more effective, substitute that uses minimal feedback, which we call Decoupled Greedy Learning (DGL). It is based on a greedy relaxation of the joint training objective, recently shown to be effective in the context of Convolutional Neural Networks (CNNs) on large-scale image classification. We consider an optimization of this objective that permits us to decouple the layer training, allowing for layers or modules in networks to be trained with a potentially linear parallelization in layers. With the use of a replay buffer we show this approach can be extended to asynchronous settings, where modules can operate with possibly large communication delays. We show theoretically and empirically that this approach converges. Then, we empirically find that it can lead to better generalization than sequential greedy optimization. We demonstrate the effectiveness of DGL against alternative approaches on the CIFAR-10 dataset and on the large-scale ImageNet dataset. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1901.08164v4-abstract-full').style.display = 'none'; document.getElementById('1901.08164v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 June, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 January, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2019. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1812.11446">arXiv:1812.11446</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1812.11446">pdf</a>, <a href="https://arxiv.org/format/1812.11446">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Greedy Layerwise Learning Can Scale to ImageNet </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Belilovsky%2C+E">Eugene Belilovsky</a>, <a href="/search/cs?searchtype=author&amp;query=Eickenberg%2C+M">Michael Eickenberg</a>, <a href="/search/cs?searchtype=author&amp;query=Oyallon%2C+E">Edouard Oyallon</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1812.11446v3-abstract-short" style="display: inline;"> Shallow supervised 1-hidden layer neural networks have a number of favorable properties that make them easier to interpret, analyze, and optimize than their deep counterparts, but lack their representational power. Here we use 1-hidden layer learning problems to sequentially build deep networks layer by layer, which can inherit properties from shallow networks. Contrary to previous approaches usin&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1812.11446v3-abstract-full').style.display = 'inline'; document.getElementById('1812.11446v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1812.11446v3-abstract-full" style="display: none;"> Shallow supervised 1-hidden layer neural networks have a number of favorable properties that make them easier to interpret, analyze, and optimize than their deep counterparts, but lack their representational power. Here we use 1-hidden layer learning problems to sequentially build deep networks layer by layer, which can inherit properties from shallow networks. Contrary to previous approaches using shallow networks, we focus on problems where deep learning is reported as critical for success. We thus study CNNs on image classification tasks using the large-scale ImageNet dataset and the CIFAR-10 dataset. Using a simple set of ideas for architecture and training we find that solving sequential 1-hidden-layer auxiliary problems lead to a CNN that exceeds AlexNet performance on ImageNet. Extending this training methodology to construct individual layers by solving 2-and-3-hidden layer auxiliary problems, we obtain an 11-layer network that exceeds several members of the VGG model family on ImageNet, and can train a VGG-11 model to the same accuracy as end-to-end learning. To our knowledge, this is the first competitive alternative to end-to-end training of CNNs that can scale to ImageNet. We illustrate several interesting properties of these models theoretically and conduct a range of experiments to study the properties this training induces on the intermediate layers. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1812.11446v3-abstract-full').style.display = 'none'; document.getElementById('1812.11446v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 April, 2019; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 December, 2018; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2018. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1812.11214">arXiv:1812.11214</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1812.11214">pdf</a>, <a href="https://arxiv.org/ps/1812.11214">ps</a>, <a href="https://arxiv.org/format/1812.11214">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Kymatio: Scattering Transforms in Python </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Andreux%2C+M">Mathieu Andreux</a>, <a href="/search/cs?searchtype=author&amp;query=Angles%2C+T">Tom谩s Angles</a>, <a href="/search/cs?searchtype=author&amp;query=Exarchakis%2C+G">Georgios Exarchakis</a>, <a href="/search/cs?searchtype=author&amp;query=Leonarduzzi%2C+R">Roberto Leonarduzzi</a>, <a href="/search/cs?searchtype=author&amp;query=Rochette%2C+G">Gaspar Rochette</a>, <a href="/search/cs?searchtype=author&amp;query=Thiry%2C+L">Louis Thiry</a>, <a href="/search/cs?searchtype=author&amp;query=Zarka%2C+J">John Zarka</a>, <a href="/search/cs?searchtype=author&amp;query=Mallat%2C+S">St茅phane Mallat</a>, <a href="/search/cs?searchtype=author&amp;query=and%C3%A9n%2C+J">Joakim and茅n</a>, <a href="/search/cs?searchtype=author&amp;query=Belilovsky%2C+E">Eugene Belilovsky</a>, <a href="/search/cs?searchtype=author&amp;query=Bruna%2C+J">Joan Bruna</a>, <a href="/search/cs?searchtype=author&amp;query=Lostanlen%2C+V">Vincent Lostanlen</a>, <a href="/search/cs?searchtype=author&amp;query=Chaudhary%2C+M">Muawiz Chaudhary</a>, <a href="/search/cs?searchtype=author&amp;query=Hirn%2C+M+J">Matthew J. Hirn</a>, <a href="/search/cs?searchtype=author&amp;query=Oyallon%2C+E">Edouard Oyallon</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+S">Sixin Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Cella%2C+C">Carmine Cella</a>, <a href="/search/cs?searchtype=author&amp;query=Eickenberg%2C+M">Michael Eickenberg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1812.11214v3-abstract-short" style="display: inline;"> The wavelet scattering transform is an invariant signal representation suitable for many signal processing and machine learning applications. We present the Kymatio software package, an easy-to-use, high-performance Python implementation of the scattering transform in 1D, 2D, and 3D that is compatible with modern deep learning frameworks. All transforms may be executed on a GPU (in addition to CPU&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1812.11214v3-abstract-full').style.display = 'inline'; document.getElementById('1812.11214v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1812.11214v3-abstract-full" style="display: none;"> The wavelet scattering transform is an invariant signal representation suitable for many signal processing and machine learning applications. We present the Kymatio software package, an easy-to-use, high-performance Python implementation of the scattering transform in 1D, 2D, and 3D that is compatible with modern deep learning frameworks. All transforms may be executed on a GPU (in addition to CPU), offering a considerable speed up over CPU implementations. The package also has a small memory footprint, resulting inefficient memory usage. The source code, documentation, and examples are available undera BSD license at https://www.kymat.io/ <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1812.11214v3-abstract-full').style.display = 'none'; document.getElementById('1812.11214v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 May, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 December, 2018; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2018. </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Belilovsky%2C+E&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Belilovsky%2C+E&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Belilovsky%2C+E&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

Pages: 1 2 3 4 5 6 7 8 9 10