
Search | arXiv e-print repository

<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 77 results for author: <span class="mathjax">Rish, I</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Rish%2C+I">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Rish, I"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Rish%2C+I&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Rish, I"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Rish%2C+I&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Rish%2C+I&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Rish%2C+I&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.09672">arXiv:2501.09672</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.09672">pdf</a>, <a href="https://arxiv.org/format/2501.09672">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Robin: a Suite of Multi-Scale Vision-Language Models and the CHIRP Evaluation Benchmark </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Roger%2C+A">Alexis Roger</a>, <a href="/search/cs?searchtype=author&amp;query=Humane%2C+P">Prateek Humane</a>, <a href="/search/cs?searchtype=author&amp;query=Kaplan%2C+D+Z">Daniel Z. Kaplan</a>, <a href="/search/cs?searchtype=author&amp;query=Gupta%2C+K">Kshitij Gupta</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+Q">Qi Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Adamopoulos%2C+G">George Adamopoulos</a>, <a href="/search/cs?searchtype=author&amp;query=Lim%2C+J+S+C">Jonathan Siu Chi Lim</a>, <a href="/search/cs?searchtype=author&amp;query=Anthony%2C+Q">Quentin Anthony</a>, <a href="/search/cs?searchtype=author&amp;query=Fennell%2C+E">Edwin Fennell</a>, <a href="/search/cs?searchtype=author&amp;query=Rish%2C+I">Irina Rish</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.09672v2-abstract-short" style="display: inline;"> The proliferation of Vision-Language Models (VLMs) in the past several years calls for rigorous and comprehensive evaluation methods and benchmarks. This work analyzes existing VLM evaluation techniques, including automated metrics, AI-based assessments, and human evaluations across diverse tasks. 
We first introduce Robin - a novel suite of VLMs that we built by combining Large Language Models (LL&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.09672v2-abstract-full').style.display = 'inline'; document.getElementById('2501.09672v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.09672v2-abstract-full" style="display: none;"> The proliferation of Vision-Language Models (VLMs) in the past several years calls for rigorous and comprehensive evaluation methods and benchmarks. This work analyzes existing VLM evaluation techniques, including automated metrics, AI-based assessments, and human evaluations across diverse tasks. We first introduce Robin - a novel suite of VLMs that we built by combining Large Language Models (LLMs) and Vision Encoders (VEs) at multiple scales, and use Robin to identify shortcomings of current evaluation approaches across scales. Next, to overcome the identified limitations, we introduce CHIRP - a new long form response benchmark we developed for more robust and complete VLM evaluation. We provide open access to the Robin training code, model suite, and CHIRP benchmark to promote reproducibility and advance VLM research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.09672v2-abstract-full').style.display = 'none'; document.getElementById('2501.09672v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.14355">arXiv:2412.14355</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.14355">pdf</a>, <a href="https://arxiv.org/format/2412.14355">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Enabling Realtime Reinforcement Learning at Scale with Staggered Asynchronous Inference </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Riemer%2C+M">Matthew Riemer</a>, <a href="/search/cs?searchtype=author&amp;query=Subbaraj%2C+G">Gopeshh Subbaraj</a>, <a href="/search/cs?searchtype=author&amp;query=Berseth%2C+G">Glen Berseth</a>, <a href="/search/cs?searchtype=author&amp;query=Rish%2C+I">Irina Rish</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.14355v1-abstract-short" style="display: inline;"> Realtime environments change even as agents perform action inference and learning, thus requiring high interaction frequencies to effectively minimize regret. However, recent advances in machine learning involve larger neural networks with longer inference times, raising questions about their applicability in realtime systems where reaction time is crucial. 
We present an analysis of lower bounds on regret in realtime reinforcement learning (RL) environments to show that minimizing long-term regret is generally impossible within the typical sequential interaction and learning paradigm, but often becomes possible when sufficient asynchronous compute is available. We propose novel algorithms for staggering asynchronous inference processes to ensure that actions are taken at consistent time intervals, and demonstrate that use of models with high action inference times is only constrained by the environment's effective stochasticity over the inference horizon, and not by action frequency. Our analysis shows that the number of inference processes needed scales linearly with increasing inference times while enabling use of models that are multiple orders of magnitude larger than existing approaches when learning from a realtime simulation of Game Boy games such as Pokémon and Tetris.
Submitted 18 December, 2024; originally announced December 2024.
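The staggering idea in this abstract can be illustrated with a minimal sketch (an assumption-laden toy, not the authors' algorithm): if a forward pass spans several environment steps, enough inference workers are started at offset times that one of them finishes, and therefore acts, on every step.

```python
# Toy sketch of staggered asynchronous inference (not the paper's code):
# workers take turns starting a forward pass, one per environment step, and
# each pass takes `latency` steps, so an action becomes available every step
# once the first pass has finished.

def staggered_schedule(num_steps: int, latency: int, num_workers: int):
    """Return, for each env step, the worker whose action becomes available."""
    assert num_workers >= latency, "need at least `latency` workers to act every step"
    schedule = []
    for t in range(num_steps):
        if t < latency:
            schedule.append(None)  # warm-up: no pass has finished yet
        else:
            # The pass finishing now was started at step t - latency by
            # worker (t - latency) % num_workers.
            schedule.append((t - latency) % num_workers)
    return schedule

if __name__ == "__main__":
    # With a 4-step inference latency and 4 workers, every step after the
    # warm-up has an action ready, despite the slow model.
    print(staggered_schedule(num_steps=12, latency=4, num_workers=4))
```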
3. arXiv:2411.12372 [pdf, other]  cs.CL; cs.LG
RedPajama: an Open Dataset for Training Large Language Models
Authors: Maurice Weber, Daniel Fu, Quentin Anthony, Yonatan Oren, Shane Adams, Anton Alexandrov, Xiaozhong Lyu, Huu Nguyen, Xiaozhe Yao, Virginia Adams, Ben Athiwaratkun, Rahul Chalamala, Kezhen Chen, Max Ryabinin, Tri Dao, Percy Liang, Christopher Ré, Irina Rish, Ce Zhang
Abstract: Large language models are increasingly becoming a cornerstone technology in artificial intelligence, the sciences, and society as a whole, yet the optimal strategies for dataset composition and filtering remain largely elusive.
Many of the top-performing models lack transparency in their dataset curation and model development processes, posing an obstacle to the development of fully open language models. In this paper, we identify three core data-related challenges that must be addressed to advance open-source language models. These include (1) transparency in model development, including the data curation process, (2) access to large quantities of high-quality data, and (3) availability of artifacts and metadata for dataset curation and analysis. To address these challenges, we release RedPajama-V1, an open reproduction of the LLaMA training dataset. In addition, we release RedPajama-V2, a massive web-only dataset consisting of raw, unfiltered text data together with quality signals and metadata. Together, the RedPajama datasets comprise over 100 trillion tokens spanning multiple domains, and their quality signals facilitate the filtering of data, aiming to inspire the development of numerous new datasets. To date, these datasets have already been used in the training of strong language models used in production, such as Snowflake Arctic, Salesforce's XGen and AI2's OLMo. To provide insight into the quality of RedPajama, we present a series of analyses and ablation studies with decoder-only language models with up to 1.6B parameters. Our findings demonstrate how quality signals for web data can be effectively leveraged to curate high-quality subsets of the dataset, underscoring the potential of RedPajama to advance the development of transparent and high-performing language models at scale.
Submitted 19 November, 2024; originally announced November 2024.
Comments: 38th Conference on Neural Information Processing Systems (NeurIPS 2024) Track on Datasets and Benchmarks.
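As a rough illustration of how such quality signals can drive curation, the sketch below keeps documents whose signals clear simple thresholds; the field names and thresholds are invented for the example and are not RedPajama-V2's actual schema.

```python
# Hypothetical sketch of filtering web documents by quality signals; the
# signal names and thresholds here are illustrative, not RedPajama-V2's schema.

def passes_quality_filters(doc: dict) -> bool:
    signals = doc["quality_signals"]
    return (
        signals["word_count"] >= 100             # drop very short pages
        and signals["duplicate_fraction"] < 0.5   # drop heavily duplicated text
        and signals["lang_score"] > 0.8           # keep confident language detections
    )

corpus = [
    {"text": "...", "quality_signals": {"word_count": 250, "duplicate_fraction": 0.1, "lang_score": 0.95}},
    {"text": "...", "quality_signals": {"word_count": 30, "duplicate_fraction": 0.7, "lang_score": 0.40}},
]
filtered = [d for d in corpus if passes_quality_filters(d)]
print(len(filtered))  # -> 1
```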
4. arXiv:2411.07007 [pdf, other]  cs.LG; cs.AI
Non-Adversarial Inverse Reinforcement Learning via Successor Feature Matching
Authors: Arnav Kumar Jain, Harley Wiltzer, Jesse Farebrother, Irina Rish, Glen Berseth, Sanjiban Choudhury
Abstract: In inverse reinforcement learning (IRL), an agent seeks to replicate expert demonstrations through interactions with the environment. Traditionally, IRL is treated as an adversarial game, where an adversary searches over reward models, and a learner optimizes the reward through repeated RL procedures. This game-solving approach is both computationally expensive and difficult to stabilize. In this work, we propose a novel approach to IRL by direct policy optimization: exploiting a linear factorization of the return as the inner product of successor features and a reward vector, we design an IRL algorithm by policy gradient descent on the gap between the learner and expert features. Our non-adversarial method does not require learning a reward function and can be solved seamlessly with existing actor-critic RL algorithms. Remarkably, our approach works in state-only settings without expert action labels, a setting which behavior cloning (BC) cannot solve.
Empirical results demonstrate that our method learns from as few as a single expert demonstration and achieves improved performance on various control tasks.
Submitted 11 November, 2024; originally announced November 2024.
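The linear factorization mentioned here can be made concrete with a small sketch (assumed notation, not the authors' implementation): if the per-state reward is $r(s) = \phi(s)^\top w$, then a trajectory's return is $\psi^\top w$ with $\psi = \sum_t \gamma^t \phi(s_t)$, so shrinking the gap between the learner's and the expert's $\psi$ matches returns for any reward vector $w$.

```python
import numpy as np

# Minimal sketch (assumed notation, not the authors' code): with reward
# r(s) = phi(s) . w, a trajectory's return equals psi . w, where
# psi = sum_t gamma^t phi(s_t) are discounted successor features. Matching
# the expert's psi therefore matches the expert's return for any w.

def successor_features(traj_features: np.ndarray, gamma: float = 0.99) -> np.ndarray:
    """traj_features: array of shape (T, d) holding phi(s_t) for each step."""
    discounts = gamma ** np.arange(len(traj_features))
    return (discounts[:, None] * traj_features).sum(axis=0)

# Toy check: the feature gap that a policy-gradient learner would shrink.
expert = np.random.rand(50, 4)   # phi(s_t) along an expert trajectory
learner = np.random.rand(50, 4)  # phi(s_t) along the learner's trajectory
gap = successor_features(expert) - successor_features(learner)
print(np.linalg.norm(gap))
```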
5. arXiv:2411.05830 [pdf, other]  cs.SE; cs.LG
GitChameleon: Unmasking the Version-Switching Capabilities of Code Generation Models
Authors: Nizar Islah, Justine Gehring, Diganta Misra, Eilif Muller, Irina Rish, Terry Yue Zhuo, Massimo Caccia
Abstract: The rapid evolution of software libraries presents a significant challenge for code generation models, which must adapt to frequent version updates while maintaining compatibility with previous versions. Existing code completion benchmarks often overlook this dynamic aspect, and the one that does consider it relies on static code prediction tasks without execution-based evaluation, offering a limited perspective on a model's practical usability. To address this gap, we introduce GitChameleon, a novel, manually curated dataset comprising 116 Python code completion problems, each conditioned on specific library versions and accompanied by executable unit tests. GitChameleon is designed to rigorously assess the ability of modern large language models (LLMs) to generate version-specific code that is not only syntactically correct but also functionally accurate upon execution. Our comprehensive evaluations reveal that state-of-the-art LLMs struggle with this task; for instance, GPT-4o achieves a pass@10 of only 39.9% (43.7% when provided with error feedback), highlighting the complexity of the problem and the limitations of current models. By providing an execution-based benchmark that emphasizes the dynamic nature of code libraries, GitChameleon serves as a critical tool to advance the development of more adaptable and reliable code generation models. To facilitate further exploration of version-conditioned code generation, we make our code repository publicly accessible at https://github.com/NizarIslah/GitChameleon.
Submitted 5 November, 2024; originally announced November 2024.

6. arXiv:2411.02344 [pdf, other]  cs.LG; cs.CL
Seq-VCR: Preventing Collapse in Intermediate Transformer Representations for Enhanced Reasoning
Authors: Md Rifat Arefin, Gopeshh Subbaraj, Nicolas Gontier, Yann LeCun, Irina Rish, Ravid Shwartz-Ziv, Christopher Pal
Abstract: Decoder-only Transformers often struggle with complex reasoning tasks, particularly arithmetic reasoning requiring multiple sequential operations. In this work, we identify representation collapse in the model's intermediate layers as a key factor limiting their reasoning capabilities.
To address this, we propose Sequential Variance-Covariance Regularization (Seq-VCR), which enhances the entropy of intermediate representations and prevents collapse. Combined with dummy pause tokens as substitutes for chain-of-thought (CoT) tokens, our method significantly improves performance in arithmetic reasoning problems. In the challenging $5 \times 5$ integer multiplication task, our approach achieves $99.5\%$ exact match accuracy, outperforming models of the same size (which yield $0\%$ accuracy) and GPT-4 with five-shot CoT prompting ($44\%$). We also demonstrate superior results on arithmetic expression and longest increasing subsequence (LIS) datasets. Our findings highlight the importance of preventing intermediate layer representation collapse to enhance the reasoning capabilities of Transformers and show that Seq-VCR offers an effective solution without requiring explicit CoT supervision.
Submitted 4 November, 2024; originally announced November 2024.
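A regularizer in this spirit can be sketched as below (a generic variance-covariance penalty; the paper's exact loss, coefficients, and placement across layers may differ): it pushes each dimension of an intermediate representation to keep non-trivial variance while decorrelating dimensions, which counteracts collapse.

```python
import torch

def variance_covariance_penalty(h: torch.Tensor, eps: float = 1e-4) -> torch.Tensor:
    """Rough sketch of a variance-covariance regularizer on a batch of
    intermediate representations h of shape (batch, dim); the paper's exact
    formulation may differ."""
    h = h - h.mean(dim=0)
    # Variance term: push each dimension's std toward at least 1.
    std = torch.sqrt(h.var(dim=0) + eps)
    var_loss = torch.relu(1.0 - std).mean()
    # Covariance term: penalize off-diagonal covariance (redundant dimensions).
    cov = (h.T @ h) / (h.shape[0] - 1)
    off_diag = cov - torch.diag(torch.diag(cov))
    cov_loss = (off_diag ** 2).sum() / h.shape[1]
    return var_loss + cov_loss

print(variance_covariance_penalty(torch.randn(32, 64)))
```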
7. arXiv:2410.18959 [pdf, other]  cs.LG; cs.AI; stat.ML
Context is Key: A Benchmark for Forecasting with Essential Textual Information
Authors: Andrew Robert Williams, Arjun Ashok, Étienne Marcotte, Valentina Zantedeschi, Jithendaraa Subramanian, Roland Riachi, James Requeima, Alexandre Lacoste, Irina Rish, Nicolas Chapados, Alexandre Drouin
Abstract: Forecasting is a critical task in decision-making across numerous domains. While historical numerical data provide a start, they fail to convey the complete context for reliable and accurate predictions. Human forecasters frequently rely on additional information, such as background knowledge and constraints, which can efficiently be communicated through natural language. However, in spite of recent progress with LLM-based forecasters, their ability to effectively integrate this textual information remains an open question.
To address this, we introduce "Context is Key" (CiK), a time-series forecasting benchmark that pairs numerical data with diverse types of carefully crafted textual context, requiring models to integrate both modalities; crucially, every task in CiK requires understanding textual context to be solved successfully. We evaluate a range of approaches, including statistical models, time series foundation models, and LLM-based forecasters, and propose a simple yet effective LLM prompting method that outperforms all other tested methods on our benchmark. Our experiments highlight the importance of incorporating contextual information, demonstrate surprising performance when using LLM-based forecasting models, and also reveal some of their critical shortcomings. This benchmark aims to advance multimodal forecasting by promoting models that are both accurate and accessible to decision-makers with varied technical expertise. The benchmark can be visualized at https://servicenow.github.io/context-is-key-forecasting/v0/.
Submitted 6 February, 2025; v1 submitted 24 October, 2024; originally announced October 2024.
Comments: Preprint; under review. First two authors contributed equally.

8. arXiv:2409.05817 [pdf, other]  cs.CV; cs.HC
VFA: Vision Frequency Analysis of Foundation Models and Human
Authors: Mohammad-Javad Darvishi-Bayazi, Md Rifat Arefin, Jocelyn Faubert, Irina Rish
Abstract: Machine learning models often struggle with distribution shifts in real-world scenarios, whereas humans exhibit robust adaptation. Models that better align with human perception may achieve higher out-of-distribution generalization. In this study, we investigate how various characteristics of large-scale computer vision models influence their alignment with human capabilities and robustness.
Our findings indicate that increasing model and data size and incorporating rich semantic information and multiple modalities enhance models' alignment with human perception and their overall robustness. Our empirical analysis demonstrates a strong correlation between out-of-distribution accuracy and human alignment.
Submitted 9 September, 2024; originally announced September 2024.

9. arXiv:2407.12327 [pdf, other]  cs.LG; cs.AI; cs.CL
Spectra: Surprising Effectiveness of Pretraining Ternary Language Models at Scale
Authors: Ayush Kaushal, Tejas Vaidhya, Arnab Kumar Mondal, Tejas Pandey, Aaryan Bhagat, Irina Rish
Abstract: Rapid advancements in GPU computational power have outpaced memory capacity and bandwidth growth, creating bottlenecks in Large Language Model (LLM) inference. Post-training quantization is the leading method for addressing memory-related bottlenecks in LLM inference, but it suffers from significant performance degradation below 4-bit precision.
This paper addresses these challenges by investigating the pretraining of low-bitwidth models, specifically Ternary Language Models (TriLMs), as an alternative to traditional floating-point models (FloatLMs) and their post-training quantized versions (QuantLMs). We present the Spectra LLM suite, the first open suite of LLMs spanning multiple bit-widths, including FloatLMs, QuantLMs, and TriLMs, ranging from 99M to 3.9B parameters trained on 300B tokens. Our comprehensive evaluation demonstrates that TriLMs offer superior scaling behavior in terms of model size (in bits). Surprisingly, at scales exceeding one billion parameters, TriLMs consistently outperform their QuantLM and FloatLM counterparts for a given bit size across various benchmarks. Notably, the 3.9B parameter TriLM matches the performance of the FloatLM 3.9B across all benchmarks, despite having fewer bits than the FloatLM 830M. Overall, this research provides valuable insights into the feasibility and scalability of low-bitwidth language models, paving the way for the development of more efficient LLMs. To enhance understanding of low-bitwidth models, we are releasing 500+ intermediate checkpoints of the Spectra suite at https://github.com/NolanoOrg/SpectraSuite.
Submitted 11 October, 2024; v1 submitted 17 July, 2024; originally announced July 2024.
Comments: 42 pages, 21 figures, and 13 tables. MSC Class: 68T30. ACM Class: I.2.6; I.2.7.
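For readers unfamiliar with ternary models, the toy function below shows what constraining weights to {-1, 0, +1} with a single scale looks like; it is illustrative background only (using a common thresholding heuristic) and is unrelated to the TriLM pretraining procedure described in the abstract.

```python
import numpy as np

def ternarize(w: np.ndarray):
    """Illustrative ternary quantization (not the TriLM training recipe):
    map weights to {-1, 0, +1} with a single per-tensor scale."""
    threshold = 0.7 * np.abs(w).mean()           # common heuristic threshold
    q = np.where(np.abs(w) > threshold, np.sign(w), 0.0)
    scale = np.abs(w[q != 0]).mean() if np.any(q != 0) else 1.0
    return q, scale

w = np.random.randn(4, 4)
q, scale = ternarize(w)
print(q)       # holds only -1, 0, +1
print(scale)   # scale * q roughly approximates w
```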
10. arXiv:2407.12161 [pdf, other]  cs.AI
Interpretability in Action: Exploratory Analysis of VPT, a Minecraft Agent
Authors: Karolis Jucys, George Adamopoulos, Mehrab Hamidi, Stephanie Milani, Mohammad Reza Samsami, Artem Zholus, Sonia Joseph, Blake Richards, Irina Rish, Özgür Şimşek
Abstract: Understanding the mechanisms behind decisions taken by large foundation models in sequential decision making tasks is critical to ensuring that such systems operate transparently and safely. In this work, we perform exploratory analysis on the Video PreTraining (VPT) Minecraft playing agent, one of the largest open-source vision-based agents. We aim to illuminate its reasoning mechanisms by applying various interpretability techniques. First, we analyze the attention mechanism while the agent solves its training task - crafting a diamond pickaxe. The agent pays attention to the last four frames and several key-frames further back in its six-second memory.
This is a possible mechanism for maintaining coherence in a task that takes 3-10 minutes, despite the short memory span. Secondly, we perform various interventions, which help us uncover a worrying case of goal misgeneralization: VPT mistakenly identifies a villager wearing brown clothes as a tree trunk when the villager is positioned stationary under green tree leaves, and punches it to death.
Submitted 16 July, 2024; originally announced July 2024.
Comments: Mechanistic Interpretability Workshop at ICML 2024.

11. arXiv:2407.11121 [pdf, other]  cs.CV; cs.AI; cs.LG
Towards Adversarially Robust Vision-Language Models: Insights from Design Choices and Prompt Formatting Techniques
Authors: Rishika Bhagwatkar, Shravan Nayak, Reza Bayat, Alexis Roger, Daniel Z Kaplan, Pouya Bashivan, Irina Rish
Abstract: Vision-Language Models (VLMs) have witnessed a surge in both research and real-world applications. However, as they are becoming increasingly prevalent, ensuring their robustness against adversarial attacks is paramount. This work systematically investigates the impact of model design choices on the adversarial robustness of VLMs against image-based attacks.
Additionally, we introduce novel, cost-effective approaches to enhance robustness through prompt formatting. By rephrasing questions and suggesting potential adversarial perturbations, we demonstrate substantial improvements in model robustness against strong image-based attacks such as Auto-PGD. Our findings provide important guidelines for developing more robust VLMs, particularly for deployment in safety-critical environments.
Submitted 15 July, 2024; originally announced July 2024.

12. arXiv:2407.04680 [pdf, ps, other]  q-bio.NC; cs.AI; cs.CL
Lost in Translation: The Algorithmic Gap Between LMs and the Brain
Authors: Tommaso Tosato, Pascal Jr Tikeng Notsawo, Saskia Helbling, Irina Rish, Guillaume Dumas
Abstract: Language Models (LMs) have achieved impressive performance on various linguistic tasks, but their relationship to human language processing in the brain remains unclear.
This paper examines the gaps and overlaps between LMs and the brain at different levels of analysis, emphasizing the importance of looking beyond input-output behavior to examine and compare the internal processes of these systems&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.04680v1-abstract-full').style.display = 'inline'; document.getElementById('2407.04680v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.04680v1-abstract-full" style="display: none;"> Language Models (LMs) have achieved impressive performance on various linguistic tasks, but their relationship to human language processing in the brain remains unclear. This paper examines the gaps and overlaps between LMs and the brain at different levels of analysis, emphasizing the importance of looking beyond input-output behavior to examine and compare the internal processes of these systems. We discuss how insights from neuroscience, such as sparsity, modularity, internal states, and interactive learning, can inform the development of more biologically plausible language models. Furthermore, we explore the role of scaling laws in bridging the gap between LMs and human cognition, highlighting the need for efficiency constraints analogous to those in biological systems. By developing LMs that more closely mimic brain function, we aim to advance both artificial intelligence and our understanding of human cognition. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.04680v1-abstract-full').style.display = 'none'; document.getElementById('2407.04680v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.00153">arXiv:2406.00153</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.00153">pdf</a>, <a href="https://arxiv.org/format/2406.00153">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> $μ$LO: Compute-Efficient Meta-Generalization of Learned Optimizers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Th%C3%A9rien%2C+B">Benjamin Thérien</a>, <a href="/search/cs?searchtype=author&amp;query=Joseph%2C+C">Charles-Étienne Joseph</a>, <a href="/search/cs?searchtype=author&amp;query=Knyazev%2C+B">Boris Knyazev</a>, <a href="/search/cs?searchtype=author&amp;query=Oyallon%2C+E">Edouard Oyallon</a>, <a href="/search/cs?searchtype=author&amp;query=Rish%2C+I">Irina Rish</a>, <a href="/search/cs?searchtype=author&amp;query=Belilovsky%2C+E">Eugene Belilovsky</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.00153v2-abstract-short" style="display: inline;"> Learned optimizers (LOs) can significantly reduce the wall-clock training time of neural networks, substantially reducing training costs. 
However, they can struggle to optimize unseen tasks (meta-generalize), especially when training networks much larger than those seen during meta-training. To address this, we derive the Maximal Update Parametrization ($μ$P) for two popular learned optimizer arch&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.00153v2-abstract-full').style.display = 'inline'; document.getElementById('2406.00153v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.00153v2-abstract-full" style="display: none;"> Learned optimizers (LOs) can significantly reduce the wall-clock training time of neural networks, substantially reducing training costs. However, they can struggle to optimize unseen tasks (meta-generalize), especially when training networks much larger than those seen during meta-training. To address this, we derive the Maximal Update Parametrization ($μ$P) for two popular learned optimizer architectures and propose a simple meta-training recipe for $μ$-parameterized LOs ($μ$LOs). Our empirical evaluation demonstrates that LOs meta-trained with our recipe substantially improve meta-generalization to wider unseen tasks when compared to LOs trained under standard parametrization (e.g., as they are trained in existing work). When applying our $μ$LOs, each trained for less than 250 GPU-hours, to large-width models we are often able to match or exceed the performance of pre-trained VeLO, the most performant publicly available learned optimizer, meta-trained with 4000 TPU-months of compute. We also observe that learned optimizers trained with our $μ$LO recipe exhibit substantially improved meta-generalization to deeper networks ($5\times$ meta-training) and remarkable generalization to much longer training horizons ($25\times$ meta-training). <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.00153v2-abstract-full').style.display = 'none'; document.getElementById('2406.00153v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 31 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. 
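<p class="is-size-7">The abstract above relies on the Maximal Update Parametrization ($μ$P). As a rough, hedged illustration only (not the paper's derivation for learned-optimizer architectures), the sketch below scales an MLP's initialization and per-layer Adam learning rates with width relative to an arbitrary base width; the widths, base learning rate, and simplified scaling rules are assumptions made for illustration.</p>
<pre><code class="language-python">
# Illustrative muP-flavoured width scaling for Adam (simplified; the exact
# parametrization for learned optimizers is derived in the paper).
import torch
import torch.nn as nn

def make_mlp(width, d_in=32, d_out=10):
    model = nn.Sequential(
        nn.Linear(d_in, width), nn.ReLU(),
        nn.Linear(width, width), nn.ReLU(),
        nn.Linear(width, d_out),
    )
    # Every weight ~ N(0, 1/fan_in); the output layer is shrunk by an extra
    # 1/width factor so logits stay O(1) as width grows.
    for layer in (m for m in model if isinstance(m, nn.Linear)):
        nn.init.normal_(layer.weight, std=layer.in_features ** -0.5)
        nn.init.zeros_(layer.bias)
    model[-1].weight.data.mul_(1.0 / width)
    return model

def mup_adam(model, width, base_width=128, base_lr=1e-3):
    # Hidden and output matrices get their Adam LR shrunk by base_width/width so
    # feature updates stay comparable across widths; the input layer and biases
    # keep the base LR. (A simplification of the full muP recipe.)
    scale = base_width / width
    scaled, unscaled = [], []
    for name, p in model.named_parameters():
        if p.ndim == 2 and not name.startswith("0."):  # "0." is the input Linear
            scaled.append(p)
        else:
            unscaled.append(p)
    return torch.optim.Adam([
        {"params": scaled, "lr": base_lr * scale},
        {"params": unscaled, "lr": base_lr},
    ])

# Usage: a narrow and a wide model share one base learning rate.
for w in (128, 1024):
    net = make_mlp(w)
    opt = mup_adam(net, width=w)
</code></pre>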
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.07377">arXiv:2404.07377</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2404.07377">pdf</a>, <a href="https://arxiv.org/format/2404.07377">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> </div> </div> <p class="title is-5 mathjax"> Deep Generative Sampling in the Dual Divergence Space: A Data-efficient &amp; Interpretative Approach for Generative AI </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Garg%2C+S">Sahil Garg</a>, <a href="/search/cs?searchtype=author&amp;query=Schneider%2C+A">Anderson Schneider</a>, <a href="/search/cs?searchtype=author&amp;query=Raj%2C+A">Anant Raj</a>, <a href="/search/cs?searchtype=author&amp;query=Rasul%2C+K">Kashif Rasul</a>, <a href="/search/cs?searchtype=author&amp;query=Nevmyvaka%2C+Y">Yuriy Nevmyvaka</a>, <a href="/search/cs?searchtype=author&amp;query=Gopal%2C+S">Sneihil Gopal</a>, <a href="/search/cs?searchtype=author&amp;query=Dhurandhar%2C+A">Amit Dhurandhar</a>, <a href="/search/cs?searchtype=author&amp;query=Cecchi%2C+G">Guillermo Cecchi</a>, <a href="/search/cs?searchtype=author&amp;query=Rish%2C+I">Irina Rish</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.07377v1-abstract-short" style="display: inline;"> Building on the remarkable achievements in generative sampling of natural images, we propose an innovative challenge, potentially overly ambitious, which involves generating samples of entire multivariate time series that resemble images. However, the statistical challenge lies in the small sample size, sometimes consisting of a few hundred subjects. This issue is especially problematic for deep g&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.07377v1-abstract-full').style.display = 'inline'; document.getElementById('2404.07377v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.07377v1-abstract-full" style="display: none;"> Building on the remarkable achievements in generative sampling of natural images, we propose an innovative challenge, potentially overly ambitious, which involves generating samples of entire multivariate time series that resemble images. However, the statistical challenge lies in the small sample size, sometimes consisting of a few hundred subjects. This issue is especially problematic for deep generative models that follow the conventional approach of generating samples from a canonical distribution and then decoding or denoising them to match the true data distribution. 
In contrast, our method is grounded in information theory and aims to implicitly characterize the distribution of images, particularly the (global and local) dependency structure between pixels. We achieve this by empirically estimating its KL-divergence in the dual form with respect to the respective marginal distribution. This enables us to perform generative sampling directly in the optimized 1-D dual divergence space. Specifically, in the dual space, training samples representing the data distribution are embedded in the form of various clusters between two end points. In theory, any sample embedded between those two end points is in-distribution w.r.t. the data distribution. Our key idea for generating novel samples of images is to interpolate between the clusters via a walk as per gradients of the dual function w.r.t. the data dimensions. In addition to the data efficiency gained from direct sampling, we propose an algorithm that offers a significant reduction in sample complexity for estimating the divergence of the data distribution with respect to the marginal distribution. We provide strong theoretical guarantees along with an extensive empirical evaluation using many real-world datasets from diverse domains, establishing the superiority of our approach w.r.t. state-of-the-art deep learning methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.07377v1-abstract-full').style.display = 'none'; document.getElementById('2404.07377v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.08763">arXiv:2403.08763</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.08763">pdf</a>, <a href="https://arxiv.org/format/2403.08763">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Simple and Scalable Strategies to Continually Pre-train Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ibrahim%2C+A">Adam Ibrahim</a>, <a href="/search/cs?searchtype=author&amp;query=Th%C3%A9rien%2C+B">Benjamin Thérien</a>, <a href="/search/cs?searchtype=author&amp;query=Gupta%2C+K">Kshitij Gupta</a>, <a href="/search/cs?searchtype=author&amp;query=Richter%2C+M+L">Mats L. 
Richter</a>, <a href="/search/cs?searchtype=author&amp;query=Anthony%2C+Q">Quentin Anthony</a>, <a href="/search/cs?searchtype=author&amp;query=Lesort%2C+T">Timothée Lesort</a>, <a href="/search/cs?searchtype=author&amp;query=Belilovsky%2C+E">Eugene Belilovsky</a>, <a href="/search/cs?searchtype=author&amp;query=Rish%2C+I">Irina Rish</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.08763v4-abstract-short" style="display: inline;"> Large language models (LLMs) are routinely pre-trained on billions of tokens, only to start the process over again once new data becomes available. A much more efficient solution is to continually pre-train these models, saving significant compute compared to re-training. However, the distribution shift induced by new data typically results in degraded performance on previous data or poor adaptati&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.08763v4-abstract-full').style.display = 'inline'; document.getElementById('2403.08763v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.08763v4-abstract-full" style="display: none;"> Large language models (LLMs) are routinely pre-trained on billions of tokens, only to start the process over again once new data becomes available. A much more efficient solution is to continually pre-train these models, saving significant compute compared to re-training. However, the distribution shift induced by new data typically results in degraded performance on previous data or poor adaptation to the new data. In this work, we show that a simple and scalable combination of learning rate (LR) re-warming, LR re-decaying, and replay of previous data is sufficient to match the performance of fully re-training from scratch on all available data, as measured by the final loss and the average score on several language model (LM) evaluation benchmarks. Specifically, we show this for a weak but realistic distribution shift between two commonly used LLM pre-training datasets (English$\rightarrow$English) and a stronger distribution shift (English$\rightarrow$German) at the $405$M parameter model scale with large dataset sizes (hundreds of billions of tokens). Selecting the weak but realistic shift for larger-scale experiments, we also find that our continual learning strategies match the re-training baseline for a 10B parameter LLM. Our results demonstrate that LLMs can be successfully updated via simple and scalable continual learning strategies, matching the re-training baseline using only a fraction of the compute. Finally, inspired by previous work, we propose alternatives to the cosine learning rate schedule that help circumvent forgetting induced by LR re-warming and that are not bound to a fixed token budget. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.08763v4-abstract-full').style.display = 'none'; document.getElementById('2403.08763v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. 
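<p class="is-size-7">The recipe above combines LR re-warming, LR re-decaying, and replay of previous data. A minimal sketch of those two ingredients is given below; the 5% replay fraction, schedule shape, and function names are illustrative assumptions, not the paper's exact settings.</p>
<pre><code class="language-python">
# Minimal sketch: (1) a learning-rate schedule that is re-warmed and re-decayed
# when continual pre-training starts on a new dataset, and (2) batches that mix
# in a small fraction of replayed previous-dataset examples.
import math
import random

def rewarmed_cosine_lr(step, warmup_steps, total_steps, max_lr, min_lr):
    """LR for the new training phase: linear re-warm, then cosine re-decay."""
    if step < warmup_steps:
        return max_lr * (step + 1) / warmup_steps
    progress = (step - warmup_steps) / max(1, total_steps - warmup_steps)
    return min_lr + 0.5 * (max_lr - min_lr) * (1 + math.cos(math.pi * progress))

def mixed_batch(new_data, old_data, batch_size=8, replay_frac=0.05):
    """Sample a batch that replays a fraction of the previous dataset."""
    n_replay = max(1, int(round(batch_size * replay_frac)))
    return random.sample(old_data, n_replay) + random.sample(new_data, batch_size - n_replay)
</code></pre>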
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.13368">arXiv:2402.13368</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2402.13368">pdf</a>, <a href="https://arxiv.org/format/2402.13368">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Unsupervised Concept Discovery Mitigates Spurious Correlations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Arefin%2C+M+R">Md Rifat Arefin</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yan Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Baratin%2C+A">Aristide Baratin</a>, <a href="/search/cs?searchtype=author&amp;query=Locatello%2C+F">Francesco Locatello</a>, <a href="/search/cs?searchtype=author&amp;query=Rish%2C+I">Irina Rish</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+D">Dianbo Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Kawaguchi%2C+K">Kenji Kawaguchi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.13368v2-abstract-short" style="display: inline;"> Models prone to spurious correlations in training data often produce brittle predictions and introduce unintended biases. Addressing this challenge typically involves methods relying on prior knowledge and group annotation to remove spurious correlations, which may not be readily available in many applications. In this paper, we establish a novel connection between unsupervised object-centric lear&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.13368v2-abstract-full').style.display = 'inline'; document.getElementById('2402.13368v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.13368v2-abstract-full" style="display: none;"> Models prone to spurious correlations in training data often produce brittle predictions and introduce unintended biases. Addressing this challenge typically involves methods relying on prior knowledge and group annotation to remove spurious correlations, which may not be readily available in many applications. In this paper, we establish a novel connection between unsupervised object-centric learning and mitigation of spurious correlations. Instead of directly inferring subgroups with varying correlations with labels, our approach focuses on discovering concepts: discrete ideas that are shared across input samples. Leveraging existing object-centric representation learning, we introduce CoBalT: a concept balancing technique that effectively mitigates spurious correlations without requiring human labeling of subgroups. Evaluation across the benchmark datasets for sub-population shifts demonstrates superior or competitive performance compared to state-of-the-art baselines, without the need for group annotation. Code is available at https://github.com/rarefin/CoBalT. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.13368v2-abstract-full').style.display = 'none'; document.getElementById('2402.13368v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> ICLM 2024 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.12868">arXiv:2312.12868</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2312.12868">pdf</a>, <a href="https://arxiv.org/format/2312.12868">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neurons and Cognition">q-bio.NC</span> </div> </div> <p class="title is-5 mathjax"> Towards Machines that Trust: AI Agents Learn to Trust in the Trust Game </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Nobandegani%2C+A+S">Ardavan S. Nobandegani</a>, <a href="/search/cs?searchtype=author&amp;query=Rish%2C+I">Irina Rish</a>, <a href="/search/cs?searchtype=author&amp;query=Shultz%2C+T+R">Thomas R. Shultz</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.12868v1-abstract-short" style="display: inline;"> Widely considered a cornerstone of human morality, trust shapes many aspects of human social interactions. In this work, we present a theoretical analysis of the $\textit{trust game}$, the canonical task for studying trust in behavioral and brain sciences, along with simulation results supporting our analysis. Specifically, leveraging reinforcement learning (RL) to train our AI agents, we systemat&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.12868v1-abstract-full').style.display = 'inline'; document.getElementById('2312.12868v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.12868v1-abstract-full" style="display: none;"> Widely considered a cornerstone of human morality, trust shapes many aspects of human social interactions. In this work, we present a theoretical analysis of the $\textit{trust game}$, the canonical task for studying trust in behavioral and brain sciences, along with simulation results supporting our analysis. Specifically, leveraging reinforcement learning (RL) to train our AI agents, we systematically investigate learning trust under various parameterizations of this task. Our theoretical analysis, corroborated by the simulations results presented, provides a mathematical basis for the emergence of trust in the trust game. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.12868v1-abstract-full').style.display = 'none'; document.getElementById('2312.12868v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.08278">arXiv:2310.08278</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2310.08278">pdf</a>, <a href="https://arxiv.org/format/2310.08278">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Lag-Llama: Towards Foundation Models for Probabilistic Time Series Forecasting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Rasul%2C+K">Kashif Rasul</a>, <a href="/search/cs?searchtype=author&amp;query=Ashok%2C+A">Arjun Ashok</a>, <a href="/search/cs?searchtype=author&amp;query=Williams%2C+A+R">Andrew Robert Williams</a>, <a href="/search/cs?searchtype=author&amp;query=Ghonia%2C+H">Hena Ghonia</a>, <a href="/search/cs?searchtype=author&amp;query=Bhagwatkar%2C+R">Rishika Bhagwatkar</a>, <a href="/search/cs?searchtype=author&amp;query=Khorasani%2C+A">Arian Khorasani</a>, <a href="/search/cs?searchtype=author&amp;query=Bayazi%2C+M+J+D">Mohammad Javad Darvishi Bayazi</a>, <a href="/search/cs?searchtype=author&amp;query=Adamopoulos%2C+G">George Adamopoulos</a>, <a href="/search/cs?searchtype=author&amp;query=Riachi%2C+R">Roland Riachi</a>, <a href="/search/cs?searchtype=author&amp;query=Hassen%2C+N">Nadhir Hassen</a>, <a href="/search/cs?searchtype=author&amp;query=Bilo%C5%A1%2C+M">Marin Bilo拧</a>, <a href="/search/cs?searchtype=author&amp;query=Garg%2C+S">Sahil Garg</a>, <a href="/search/cs?searchtype=author&amp;query=Schneider%2C+A">Anderson Schneider</a>, <a href="/search/cs?searchtype=author&amp;query=Chapados%2C+N">Nicolas Chapados</a>, <a href="/search/cs?searchtype=author&amp;query=Drouin%2C+A">Alexandre Drouin</a>, <a href="/search/cs?searchtype=author&amp;query=Zantedeschi%2C+V">Valentina Zantedeschi</a>, <a href="/search/cs?searchtype=author&amp;query=Nevmyvaka%2C+Y">Yuriy Nevmyvaka</a>, <a href="/search/cs?searchtype=author&amp;query=Rish%2C+I">Irina Rish</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.08278v3-abstract-short" style="display: inline;"> Over the past years, foundation models have caused a paradigm shift in machine learning due to their unprecedented capabilities for zero-shot and few-shot generalization. However, despite the success of foundation models in modalities such as natural language processing and computer vision, the development of foundation models for time series forecasting has lagged behind. 
We present Lag-Llama, a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.08278v3-abstract-full').style.display = 'inline'; document.getElementById('2310.08278v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.08278v3-abstract-full" style="display: none;"> Over the past years, foundation models have caused a paradigm shift in machine learning due to their unprecedented capabilities for zero-shot and few-shot generalization. However, despite the success of foundation models in modalities such as natural language processing and computer vision, the development of foundation models for time series forecasting has lagged behind. We present Lag-Llama, a general-purpose foundation model for univariate probabilistic time series forecasting based on a decoder-only transformer architecture that uses lags as covariates. Lag-Llama is pretrained on a large corpus of diverse time series data from several domains, and demonstrates strong zero-shot generalization capabilities compared to a wide range of forecasting models on downstream datasets across domains. Moreover, when fine-tuned on relatively small fractions of such previously unseen datasets, Lag-Llama achieves state-of-the-art performance, outperforming prior deep learning approaches, emerging as the best general-purpose model on average. Lag-Llama serves as a strong contender to the current state-of-art in time series forecasting and paves the way for future advancements in foundation models tailored to time series data. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.08278v3-abstract-full').style.display = 'none'; document.getElementById('2310.08278v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">First two authors contributed equally. All data, models and code used are open-source. 
GitHub: https://github.com/time-series-foundation-models/lag-llama</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2309.14021">arXiv:2309.14021</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2309.14021">pdf</a>, <a href="https://arxiv.org/format/2309.14021">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> LORD: Low Rank Decomposition Of Monolingual Code LLMs For One-Shot Compression </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Kaushal%2C+A">Ayush Kaushal</a>, <a href="/search/cs?searchtype=author&amp;query=Vaidhya%2C+T">Tejas Vaidhya</a>, <a href="/search/cs?searchtype=author&amp;query=Rish%2C+I">Irina Rish</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2309.14021v1-abstract-short" style="display: inline;"> Low Rank Decomposition of a matrix - splitting a large matrix into a product of two smaller matrices - offers a means of compression that reduces the parameters of a model without sparsification, hence delivering more speedup on modern hardware. Moreover, unlike quantization, the compressed linear layers remain fully differentiable and all the parameters trainable, while being able to leverage the&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.14021v1-abstract-full').style.display = 'inline'; document.getElementById('2309.14021v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2309.14021v1-abstract-full" style="display: none;"> Low Rank Decomposition of a matrix - splitting a large matrix into a product of two smaller matrices - offers a means of compression that reduces the parameters of a model without sparsification, hence delivering more speedup on modern hardware. Moreover, unlike quantization, the compressed linear layers remain fully differentiable and all the parameters trainable, while being able to leverage the existing highly efficient kernels over floating point matrices. We study the potential to compress Large Language Models (LLMs) for monolingual Code generation via Low Rank Decomposition (LoRD) and observe that ranks for the linear layers in these models can be reduced by up to 39.58% with less than 1% increase in perplexity. We then use Low Rank Decomposition (LoRD) to compress StarCoder 16B to 13.2B parameters with no drop and to 12.3B with minimal drop in HumanEval Pass@1 score, in less than 10 minutes on a single A100. The compressed models speed up inference by up to 22.35% with just a single line of change in code over huggingface&#39;s implementation with pytorch backend. Low Rank Decomposition (LoRD) models remain compatible with state-of-the-art near-lossless quantization methods such as SpQR, which allows leveraging further compression gains of quantization. Lastly, QLoRA over a Low Rank Decomposition (LoRD) model further reduces memory requirements by as much as 21.2% over vanilla QLoRA while offering similar gains from parameter-efficient fine-tuning. 
Our work shows Low Rank Decomposition (LoRD) as a promising new paradigm for LLM compression. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.14021v1-abstract-full').style.display = 'none'; document.getElementById('2309.14021v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2309.10910">arXiv:2309.10910</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2309.10910">pdf</a>, <a href="https://arxiv.org/format/2309.10910">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1016/j.compbiomed.2023.107893">10.1016/j.compbiomed.2023.107893 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Amplifying Pathological Detection in EEG Signaling Pathways through Cross-Dataset Transfer Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Darvishi-Bayazi%2C+M">Mohammad-Javad Darvishi-Bayazi</a>, <a href="/search/cs?searchtype=author&amp;query=Ghaemi%2C+M+S">Mohammad Sajjad Ghaemi</a>, <a href="/search/cs?searchtype=author&amp;query=Lesort%2C+T">Timothee Lesort</a>, <a href="/search/cs?searchtype=author&amp;query=Arefin%2C+M+R">Md Rifat Arefin</a>, <a href="/search/cs?searchtype=author&amp;query=Faubert%2C+J">Jocelyn Faubert</a>, <a href="/search/cs?searchtype=author&amp;query=Rish%2C+I">Irina Rish</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2309.10910v1-abstract-short" style="display: inline;"> Pathology diagnosis based on EEG signals and decoding brain activity holds immense importance in understanding neurological disorders. With the advancement of artificial intelligence methods and machine learning techniques, the potential for accurate data-driven diagnoses and effective treatments has grown significantly. However, applying machine learning algorithms to real-world datasets presents&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.10910v1-abstract-full').style.display = 'inline'; document.getElementById('2309.10910v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2309.10910v1-abstract-full" style="display: none;"> Pathology diagnosis based on EEG signals and decoding brain activity holds immense importance in understanding neurological disorders. 
With the advancement of artificial intelligence methods and machine learning techniques, the potential for accurate data-driven diagnoses and effective treatments has grown significantly. However, applying machine learning algorithms to real-world datasets presents diverse challenges at multiple levels. The scarcity of labelled data, especially in low regime scenarios with limited availability of real patient cohorts due to high costs of recruitment, underscores the vital deployment of scaling and transfer learning techniques. In this study, we explore a real-world pathology classification task to highlight the effectiveness of data and model scaling and cross-dataset knowledge transfer. As such, we observe varying performance improvements through data scaling, indicating the need for careful evaluation and labelling. Additionally, we identify the challenges of possible negative transfer and emphasize the significance of some key components to overcome distribution shifts and potential spurious correlations and achieve positive transfer. We see improvement in the performance of the target model on the target (NMT) datasets by using the knowledge from the source dataset (TUAB) when a low amount of labelled data was available. Our findings indicate a small and generic model (e.g. ShallowNet) performs well on a single dataset; however, a larger model (e.g. TCN) performs better on transfer and learning from a larger and diverse dataset. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.10910v1-abstract-full').style.display = 'none'; document.getElementById('2309.10910v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2308.04014">arXiv:2308.04014</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2308.04014">pdf</a>, <a href="https://arxiv.org/format/2308.04014">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Continual Pre-Training of Large Language Models: How to (re)warm your model? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gupta%2C+K">Kshitij Gupta</a>, <a href="/search/cs?searchtype=author&amp;query=Th%C3%A9rien%2C+B">Benjamin Thérien</a>, <a href="/search/cs?searchtype=author&amp;query=Ibrahim%2C+A">Adam Ibrahim</a>, <a href="/search/cs?searchtype=author&amp;query=Richter%2C+M+L">Mats L. 
Richter</a>, <a href="/search/cs?searchtype=author&amp;query=Anthony%2C+Q">Quentin Anthony</a>, <a href="/search/cs?searchtype=author&amp;query=Belilovsky%2C+E">Eugene Belilovsky</a>, <a href="/search/cs?searchtype=author&amp;query=Rish%2C+I">Irina Rish</a>, <a href="/search/cs?searchtype=author&amp;query=Lesort%2C+T">Timothée Lesort</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2308.04014v2-abstract-short" style="display: inline;"> Large language models (LLMs) are routinely pre-trained on billions of tokens, only to restart the process over again once new data becomes available. A much cheaper and more efficient solution would be to enable the continual pre-training of these models, i.e. updating pre-trained models with new data instead of re-training them from scratch. However, the distribution shift induced by novel data t&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.04014v2-abstract-full').style.display = 'inline'; document.getElementById('2308.04014v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2308.04014v2-abstract-full" style="display: none;"> Large language models (LLMs) are routinely pre-trained on billions of tokens, only to restart the process over again once new data becomes available. A much cheaper and more efficient solution would be to enable the continual pre-training of these models, i.e. updating pre-trained models with new data instead of re-training them from scratch. However, the distribution shift induced by novel data typically results in degraded performance on past data. Taking a step towards efficient continual pre-training, in this work, we examine the effect of different warm-up strategies. Our hypothesis is that the learning rate must be re-increased to improve compute efficiency when training on a new dataset. We study the warmup phase of models pre-trained on the Pile (upstream data, 300B tokens) as we continue to pre-train on SlimPajama (downstream data, 297B tokens), following a linear warmup and cosine decay schedule. We conduct all experiments on the Pythia 410M language model architecture and evaluate performance through validation perplexity. We experiment with different pre-training checkpoints, various maximum learning rates, and various warmup lengths. Our results show that while rewarming models first increases the loss on upstream and downstream data, in the longer run it improves the downstream performance, outperforming models trained from scratch$\unicode{x2013}$even for a large downstream dataset. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.04014v2-abstract-full').style.display = 'none'; document.getElementById('2308.04014v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2023. 
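<p class="is-size-7">The study above reports its results as validation perplexity. As a reminder of the metric, the snippet below computes perplexity from token-level cross-entropy; the <code>model</code> and <code>val_loader</code> objects, and the assumption that the model returns raw next-token logits, are placeholders rather than the paper's code.</p>
<pre><code class="language-python">
# Validation perplexity = exp(mean token-level negative log-likelihood).
# `model` and `val_loader` are placeholders for an autoregressive LM and its data.
import math
import torch
import torch.nn.functional as F

@torch.no_grad()
def validation_perplexity(model, val_loader, device="cpu"):
    total_nll, total_tokens = 0.0, 0
    for batch in val_loader:              # batch: LongTensor [B, T] of token ids
        tokens = batch.to(device)
        logits = model(tokens)            # assumed to return [B, T, vocab] logits
        nll = F.cross_entropy(
            logits[:, :-1].reshape(-1, logits.size(-1)),
            tokens[:, 1:].reshape(-1),
            reduction="sum",
        )
        total_nll += nll.item()
        total_tokens += tokens[:, 1:].numel()
    return math.exp(total_nll / total_tokens)
</code></pre>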
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2307.05735">arXiv:2307.05735</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2307.05735">pdf</a>, <a href="https://arxiv.org/format/2307.05735">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Chaotic Dynamics">nlin.CD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Data Analysis, Statistics and Probability">physics.data-an</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Medical Physics">physics.med-ph</span> </div> </div> <p class="title is-5 mathjax"> Effective Latent Differential Equation Models via Attention and Multiple Shooting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Abrevaya%2C+G">Germán Abrevaya</a>, <a href="/search/cs?searchtype=author&amp;query=Ramezanian-Panahi%2C+M">Mahta Ramezanian-Panahi</a>, <a href="/search/cs?searchtype=author&amp;query=Gagnon-Audet%2C+J">Jean-Christophe Gagnon-Audet</a>, <a href="/search/cs?searchtype=author&amp;query=Polosecki%2C+P">Pablo Polosecki</a>, <a href="/search/cs?searchtype=author&amp;query=Rish%2C+I">Irina Rish</a>, <a href="/search/cs?searchtype=author&amp;query=Dawson%2C+S+P">Silvina Ponce Dawson</a>, <a href="/search/cs?searchtype=author&amp;query=Cecchi%2C+G">Guillermo Cecchi</a>, <a href="/search/cs?searchtype=author&amp;query=Dumas%2C+G">Guillaume Dumas</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2307.05735v3-abstract-short" style="display: inline;"> Scientific Machine Learning (SciML) is a burgeoning field that synergistically combines domain-aware and interpretable models with agnostic machine learning techniques. In this work, we introduce GOKU-UI, an evolution of the SciML generative model GOKU-nets. GOKU-UI not only broadens the original model&#39;s spectrum to incorporate other classes of differential equations, such as Stochastic Differenti&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.05735v3-abstract-full').style.display = 'inline'; document.getElementById('2307.05735v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2307.05735v3-abstract-full" style="display: none;"> Scientific Machine Learning (SciML) is a burgeoning field that synergistically combines domain-aware and interpretable models with agnostic machine learning techniques. In this work, we introduce GOKU-UI, an evolution of the SciML generative model GOKU-nets. GOKU-UI not only broadens the original model&#39;s spectrum to incorporate other classes of differential equations, such as Stochastic Differential Equations (SDEs), but also integrates attention mechanisms and a novel multiple shooting training strategy in the latent space. These modifications have led to a significant increase in its performance in both reconstruction and forecast tasks, as demonstrated by our evaluation of simulated and empirical data. 
Specifically, GOKU-UI outperformed all baseline models on synthetic datasets even with a training set 16-fold smaller, underscoring its remarkable data efficiency. Furthermore, when applied to empirical human brain data, while incorporating stochastic Stuart-Landau oscillators into its dynamical core, our proposed enhancements markedly increased the model&#39;s effectiveness in capturing complex brain dynamics. This augmented version not only surpassed all baseline methods in the reconstruction task, but also demonstrated lower prediction error of future brain activity up to 15 seconds ahead. By training GOKU-UI on resting state fMRI data, we encoded whole-brain dynamics into a latent representation, learning a low-dimensional dynamical system model that could offer insights into brain functionality and open avenues for practical applications such as the classification of mental states or psychiatric conditions. Ultimately, our research provides further impetus for the field of Scientific Machine Learning, showcasing the potential for advancements when established scientific insights are interwoven with modern machine learning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.05735v3-abstract-full').style.display = 'none'; document.getElementById('2307.05735v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 11 July, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.14808">arXiv:2306.14808</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2306.14808">pdf</a>, <a href="https://arxiv.org/format/2306.14808">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Maximum State Entropy Exploration using Predecessor and Successor Representations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Jain%2C+A+K">Arnav Kumar Jain</a>, <a href="/search/cs?searchtype=author&amp;query=Lehnert%2C+L">Lucas Lehnert</a>, <a href="/search/cs?searchtype=author&amp;query=Rish%2C+I">Irina Rish</a>, <a href="/search/cs?searchtype=author&amp;query=Berseth%2C+G">Glen Berseth</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.14808v1-abstract-short" style="display: inline;"> Animals have a developed ability to explore that aids them in important tasks such as locating food, exploring for shelter, and finding misplaced items. These exploration skills necessarily track where they have been so that they can plan for finding items with relative efficiency. 
Contemporary exploration algorithms often learn a less efficient exploration strategy because they either condition o&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.14808v1-abstract-full').style.display = 'inline'; document.getElementById('2306.14808v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.14808v1-abstract-full" style="display: none;"> Animals have a developed ability to explore that aids them in important tasks such as locating food, exploring for shelter, and finding misplaced items. These exploration skills necessarily track where they have been so that they can plan for finding items with relative efficiency. Contemporary exploration algorithms often learn a less efficient exploration strategy because they either condition only on the current state or simply rely on making random open-loop exploratory moves. In this work, we propose $ηψ$-Learning, a method to learn efficient exploratory policies by conditioning on past episodic experience to make the next exploratory move. Specifically, $ηψ$-Learning learns an exploration policy that maximizes the entropy of the state visitation distribution of a single trajectory. Furthermore, we demonstrate how variants of the predecessor representation and successor representations can be combined to predict the state visitation entropy. Our experiments demonstrate the efficacy of $ηψ$-Learning to strategically explore the environment and maximize the state coverage with limited samples. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.14808v1-abstract-full').style.display = 'none'; document.getElementById('2306.14808v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.13253">arXiv:2306.13253</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2306.13253">pdf</a>, <a href="https://arxiv.org/format/2306.13253">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Predicting Grokking Long Before it Happens: A look into the loss landscape of models which grok </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Notsawo%2C+P+J+T">Pascal Jr. Tikeng Notsawo</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+H">Hattie Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Pezeshki%2C+M">Mohammad Pezeshki</a>, <a href="/search/cs?searchtype=author&amp;query=Rish%2C+I">Irina Rish</a>, <a href="/search/cs?searchtype=author&amp;query=Dumas%2C+G">Guillaume Dumas</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.13253v3-abstract-short" style="display: inline;"> This paper focuses on predicting the occurrence of grokking in neural networks, a phenomenon in which perfect generalization emerges long after signs of overfitting or memorization are observed. 
It has been reported that grokking can only be observed with certain hyper-parameters. This makes it critical to identify the parameters that lead to grokking. However, since grokking occurs after a large&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.13253v3-abstract-full').style.display = 'inline'; document.getElementById('2306.13253v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.13253v3-abstract-full" style="display: none;"> This paper focuses on predicting the occurrence of grokking in neural networks, a phenomenon in which perfect generalization emerges long after signs of overfitting or memorization are observed. It has been reported that grokking can only be observed with certain hyper-parameters. This makes it critical to identify the parameters that lead to grokking. However, since grokking occurs after a large number of epochs, searching for the hyper-parameters that lead to it is time-consuming. In this paper, we propose a low-cost method to predict grokking without training for a large number of epochs. In essence, by studying the learning curve of the first few epochs, we show that one can predict whether grokking will occur later on. Specifically, if certain oscillations occur in the early epochs, one can expect grokking to occur if the model is trained for a much longer period of time. We propose using the spectral signature of a learning curve derived by applying the Fourier transform to quantify the amplitude of low-frequency components to detect the presence of such oscillations. We also present additional experiments aimed at explaining the cause of these oscillations and characterizing the loss landscape. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.13253v3-abstract-full').style.display = 'none'; document.getElementById('2306.13253v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 22 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. 
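<p class="is-size-7">The method above looks for early-epoch oscillations through the Fourier spectrum of the learning curve. The numpy sketch below shows one way such a low-frequency measure could be computed; the cutoff of roughly the lowest 10% of frequency bins and the ratio statistic are illustrative assumptions, not the paper's exact spectral signature.</p>
<pre><code class="language-python">
# Sketch: quantify low-frequency oscillations in an early training-loss curve
# via the FFT. The cutoff (roughly the lowest 10% of bins, excluding DC) is an
# illustrative choice, not the paper's exact definition.
import numpy as np

def low_frequency_amplitude(losses, cutoff_frac=0.1):
    x = np.asarray(losses, dtype=float)
    x = x - x.mean()                      # remove the DC component
    spectrum = np.abs(np.fft.rfft(x))     # amplitude spectrum of the loss curve
    k = max(1, int(cutoff_frac * len(spectrum)))
    return spectrum[1:1 + k].sum() / (spectrum[1:].sum() + 1e-12)

# Example: a slowly oscillating loss curve scores higher than a flat one.
t = np.arange(200)
oscillating = 1.0 + 0.1 * np.sin(2 * np.pi * t / 100)
flat = np.ones_like(t, dtype=float)
print(low_frequency_amplitude(oscillating), low_frequency_amplitude(flat))
</code></pre>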
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">26 pages, 30 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.6 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2304.13765">arXiv:2304.13765</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2304.13765">pdf</a>, <a href="https://arxiv.org/format/2304.13765">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Towards ethical multimodal systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Roger%2C+A">Alexis Roger</a>, <a href="/search/cs?searchtype=author&amp;query=A%C3%AFmeur%2C+E">Esma A茂meur</a>, <a href="/search/cs?searchtype=author&amp;query=Rish%2C+I">Irina Rish</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2304.13765v3-abstract-short" style="display: inline;"> Generative AI systems (ChatGPT, DALL-E, etc) are expanding into multiple areas of our lives, from art Rombach et al. [2021] to mental health Rob Morris and Kareem Kouddous [2022]; their rapidly growing societal impact opens new opportunities, but also raises ethical concerns. The emerging field of AI alignment aims to make AI systems reflect human values. This paper focuses on evaluating the ethic&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.13765v3-abstract-full').style.display = 'inline'; document.getElementById('2304.13765v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2304.13765v3-abstract-full" style="display: none;"> Generative AI systems (ChatGPT, DALL-E, etc) are expanding into multiple areas of our lives, from art Rombach et al. [2021] to mental health Rob Morris and Kareem Kouddous [2022]; their rapidly growing societal impact opens new opportunities, but also raises ethical concerns. The emerging field of AI alignment aims to make AI systems reflect human values. This paper focuses on evaluating the ethics of multimodal AI systems involving both text and images - a relatively under-explored area, as most alignment work is currently focused on language models. We first create a multimodal ethical database from human feedback on ethicality. Then, using this database, we develop algorithms, including a RoBERTa-large classifier and a multilayer perceptron, to automatically assess the ethicality of system responses. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.13765v3-abstract-full').style.display = 'none'; document.getElementById('2304.13765v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 April, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2023. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, multimodal ethical dataset building, accepted in the NeurIPS 2023 MP2 workshop</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.7 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2302.01067">arXiv:2302.01067</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2302.01067">pdf</a>, <a href="https://arxiv.org/format/2302.01067">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Symbolic Computation">cs.SC</span> </div> </div> <p class="title is-5 mathjax"> A Survey on Compositional Generalization in Applications </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lin%2C+B">Baihan Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Bouneffouf%2C+D">Djallel Bouneffouf</a>, <a href="/search/cs?searchtype=author&amp;query=Rish%2C+I">Irina Rish</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2302.01067v1-abstract-short" style="display: inline;"> The field of compositional generalization is currently experiencing a renaissance in AI, as novel problem settings and algorithms motivated by various practical applications are being introduced, building on top of the classical compositional generalization problem. This article aims to provide a comprehensive review of top recent developments in multiple real-life applications of the compositiona&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2302.01067v1-abstract-full').style.display = 'inline'; document.getElementById('2302.01067v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2302.01067v1-abstract-full" style="display: none;"> The field of compositional generalization is currently experiencing a renaissance in AI, as novel problem settings and algorithms motivated by various practical applications are being introduced, building on top of the classical compositional generalization problem. This article aims to provide a comprehensive review of top recent developments in multiple real-life applications of the compositional generalization. Specifically, we introduce a taxonomy of common applications and summarize the state-of-the-art for each of those domains. Furthermore, we identify important current trends and provide new perspectives pertaining to the future of this burgeoning field. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2302.01067v1-abstract-full').style.display = 'none'; document.getElementById('2302.01067v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 February, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2023. 
arXiv:2211.04742 (https://arxiv.org/abs/2211.04742) [cs.LG]
Knowledge Distillation for Federated Learning: a Practical Guide
Authors: Alessio Mora, Irene Tenison, Paolo Bellavista, Irina Rish
Abstract: Federated Learning (FL) enables the training of Deep Learning models without centrally collecting possibly sensitive raw data. This paves the way for stronger privacy guarantees when building predictive models. The most used algorithms for FL are parameter-averaging based schemes (e.g., Federated Averaging) that, however, have well known limits: (i) Clients must implement the same model architecture; (ii) Transmitting model weights and model updates implies high communication cost, which scales up with the number of model parameters; (iii) In presence of non-IID data distributions, parameter-averaging aggregation schemes perform poorly due to client model drifts. Federated adaptations of regular Knowledge Distillation (KD) can solve and/or mitigate the weaknesses of parameter-averaging FL algorithms while possibly introducing other trade-offs. In this article, we provide a review of KD-based algorithms tailored for specific FL issues.
Submitted 9 November, 2022; originally announced November 2022.
Comments: 9 pages, 1 figure
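To make the contrast with parameter averaging concrete, here is a minimal sketch of one common federated-KD pattern: clients share logits on a shared public batch rather than model weights, and the server distills their averaged predictions into a global model that need not match the clients' architectures. The function name, the plain averaging of teacher logits, and the toy models are illustrative assumptions; the guide surveys many variants of this idea.

```python
import torch
import torch.nn.functional as F

def distill_server_model(server_model, public_batch, client_logits, T=2.0, lr=1e-3):
    """One distillation step: fit the server model to the average of client logits
    on a shared public batch (a generic federated-KD pattern; details vary by method)."""
    teacher = torch.stack(client_logits).mean(dim=0)          # ensemble of client "teachers"
    optim = torch.optim.SGD(server_model.parameters(), lr=lr)
    student = server_model(public_batch)
    loss = F.kl_div(F.log_softmax(student / T, dim=-1),
                    F.softmax(teacher / T, dim=-1),
                    reduction="batchmean") * T * T
    optim.zero_grad()
    loss.backward()
    optim.step()
    return loss.item()

# Toy usage: linear models stand in for heterogeneous client/server architectures.
torch.manual_seed(0)
public_batch = torch.randn(16, 10)
clients = [torch.nn.Linear(10, 3) for _ in range(4)]
client_logits = [c(public_batch).detach() for c in clients]
server = torch.nn.Linear(10, 3)
print(distill_server_model(server, public_batch, client_logits))
```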
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages, 1 figure</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2210.14891">arXiv:2210.14891</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2210.14891">pdf</a>, <a href="https://arxiv.org/format/2210.14891">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Broken Neural Scaling Laws </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Caballero%2C+E">Ethan Caballero</a>, <a href="/search/cs?searchtype=author&amp;query=Gupta%2C+K">Kshitij Gupta</a>, <a href="/search/cs?searchtype=author&amp;query=Rish%2C+I">Irina Rish</a>, <a href="/search/cs?searchtype=author&amp;query=Krueger%2C+D">David Krueger</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2210.14891v17-abstract-short" style="display: inline;"> We present a smoothly broken power law functional form (that we refer to as a Broken Neural Scaling Law (BNSL)) that accurately models &amp; extrapolates the scaling behaviors of deep neural networks (i.e. how the evaluation metric of interest varies as amount of compute used for training (or inference), number of model parameters, training dataset size, model input size, number of training steps, or&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.14891v17-abstract-full').style.display = 'inline'; document.getElementById('2210.14891v17-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2210.14891v17-abstract-full" style="display: none;"> We present a smoothly broken power law functional form (that we refer to as a Broken Neural Scaling Law (BNSL)) that accurately models &amp; extrapolates the scaling behaviors of deep neural networks (i.e. how the evaluation metric of interest varies as amount of compute used for training (or inference), number of model parameters, training dataset size, model input size, number of training steps, or upstream performance varies) for various architectures &amp; for each of various tasks within a large &amp; diverse set of upstream &amp; downstream tasks, in zero-shot, prompted, &amp; finetuned settings. This set includes large-scale vision, language, audio, video, diffusion, generative modeling, multimodal learning, contrastive learning, AI alignment, AI capabilities, robotics, out-of-distribution (OOD) generalization, continual learning, transfer learning, uncertainty estimation / calibration, OOD detection, adversarial robustness, distillation, sparsity, retrieval, quantization, pruning, fairness, molecules, computer programming/coding, math word problems, &#34;emergent phase transitions&#34;, arithmetic, supervised learning, unsupervised/self-supervised learning, &amp; reinforcement learning (single agent &amp; multi-agent). 
When compared to other functional forms for neural scaling, this functional form yields extrapolations of scaling behavior that are considerably more accurate on this set. Moreover, this functional form accurately models &amp; extrapolates scaling behavior that other functional forms are incapable of expressing such as the nonmonotonic transitions present in the scaling behavior of phenomena such as double descent &amp; the delayed, sharp inflection points present in the scaling behavior of tasks such as arithmetic. Lastly, we use this functional form to glean insights about the limit of the predictability of scaling behavior. Code is available at https://github.com/ethancaballero/broken_neural_scaling_laws <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.14891v17-abstract-full').style.display = 'none'; document.getElementById('2210.14891v17-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 July, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published as a conference paper at International Conference on Learning Representations (ICLR) 2023</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> International Conference on Learning Representations (ICLR), 2023 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2210.14161">arXiv:2210.14161</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2210.14161">pdf</a>, <a href="https://arxiv.org/format/2210.14161">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Aligning MAGMA by Few-Shot Learning and Finetuning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Layoun%2C+J">Jean-Charles Layoun</a>, <a href="/search/cs?searchtype=author&amp;query=Roger%2C+A">Alexis Roger</a>, <a href="/search/cs?searchtype=author&amp;query=Rish%2C+I">Irina Rish</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2210.14161v1-abstract-short" style="display: inline;"> The goal of vision-language modeling is to allow models to tie language understanding with visual inputs. The aim of this paper is to evaluate and align the Visual Language Model (VLM) called Multimodal Augmentation of Generative Models through Adapter-based finetuning (MAGMA) with human values. MAGMA is a VLM that is capable of image captioning and visual question-answering. 
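The exact BNSL parameterization, including its multi-break generalization, is given in the paper and the linked repository; the sketch below is only an illustration of the general idea: fit a one-break smoothly broken power law to (scale, error) measurements with SciPy's curve_fit and use it to extrapolate beyond the observed scales. The functional form, initial guesses, and synthetic data here are assumptions made for this example, not the paper's.

```python
import numpy as np
from scipy.optimize import curve_fit

def smoothly_broken_power_law(x, a, b, c0, c1, d, f):
    # One-break smoothly broken power law (illustrative; see the paper/repo for
    # the exact BNSL parameterization and its multi-break form).
    return a + b * x ** (-c0) * (1.0 + (x / d) ** (1.0 / f)) ** (-c1 * f)

# Synthetic "error vs. training-set size" data standing in for real scaling measurements.
x = np.logspace(3, 8, 30)
y = smoothly_broken_power_law(x, 0.02, 5.0, 0.25, 0.35, 1e6, 0.5)
y *= np.exp(np.random.default_rng(0).normal(0, 0.02, size=x.size))

p0 = [0.01, 1.0, 0.2, 0.3, 1e6, 1.0]                       # rough initial guesses
params, _ = curve_fit(smoothly_broken_power_law, x, y, p0=p0, maxfev=20000)
extrapolated = smoothly_broken_power_law(1e9, *params)     # predict beyond observed scales
print(params, extrapolated)
```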
arXiv:2210.14161 (https://arxiv.org/abs/2210.14161) [cs.CV, cs.AI]
Aligning MAGMA by Few-Shot Learning and Finetuning
Authors: Jean-Charles Layoun, Alexis Roger, Irina Rish
Abstract: The goal of vision-language modeling is to allow models to tie language understanding with visual inputs. The aim of this paper is to evaluate and align the Visual Language Model (VLM) called Multimodal Augmentation of Generative Models through Adapter-based finetuning (MAGMA) with human values. MAGMA is a VLM that is capable of image captioning and visual question-answering. We will evaluate its alignment in three different scenarios. To begin, we assess MAGMA's out-of-the-box alignment through the checkpoint provided by Hugging Face. Then, we measure if few-shot learning manages to improve the results. Finally, we finetune the model on aligned examples and evaluate its behavior.
Submitted 18 October, 2022; originally announced October 2022.
Comments: Accepted by the Montreal AI Symposium conference in 2022

arXiv:2210.04121 (https://arxiv.org/abs/2210.04121) [cs.AI, cs.LG, cs.MA, q-bio.NC]
Cognitive Models as Simulators: The Case of Moral Decision-Making
Authors: Ardavan S. Nobandegani, Thomas R. Shultz, Irina Rish
Abstract: To achieve desirable performance, current AI systems often require huge amounts of training data. This is especially problematic in domains where collecting data is both expensive and time-consuming, e.g., where AI systems require having numerous interactions with humans, collecting feedback from them. In this work, we substantiate the idea of $\textit{cognitive models as simulators}$, which is to have AI systems interact with, and collect feedback from, cognitive models instead of humans, thereby making their training process both less costly and faster. Here, we leverage this idea in the context of moral decision-making, by having reinforcement learning (RL) agents learn about fairness through interacting with a cognitive model of the Ultimatum Game (UG), a canonical task in behavioral and brain sciences for studying fairness. Interestingly, these RL agents learn to rationally adapt their behavior depending on the emotional state of their simulated UG responder. Our work suggests that using cognitive models as simulators of humans is an effective approach for training AI systems, presenting an important way for computational cognitive science to make contributions to AI.
Submitted 8 October, 2022; originally announced October 2022.
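A toy sketch of the simulator-in-the-loop setup described above: a simulated UG responder whose acceptance threshold depends on an emotional state, and a bandit-style proposer that learns which offer to make for each state. The responder model, thresholds, and learning rule are deliberately simple stand-ins, not the cognitive model or RL algorithm used in the paper.

```python
import random

def responder_accepts(offer, emotion):
    """Toy stand-in for a cognitive model of an Ultimatum Game responder:
    angrier responders demand a larger share (thresholds are made up)."""
    threshold = {"calm": 0.2, "neutral": 0.3, "angry": 0.45}[emotion]
    return offer >= threshold

emotions = ["calm", "neutral", "angry"]
offers = [i / 10 for i in range(1, 10)]                 # share of the pie offered
q = {(e, o): 0.0 for e in emotions for o in offers}     # value estimate per (emotion, offer)
n = {k: 0 for k in q}
rng = random.Random(0)

for _ in range(20000):                                  # epsilon-greedy bandit as the "RL agent"
    e = rng.choice(emotions)
    if rng.random() < 0.1:
        o = rng.choice(offers)
    else:
        o = max(offers, key=lambda o: q[(e, o)])
    r = (1.0 - o) if responder_accepts(o, e) else 0.0   # proposer keeps the rest if accepted
    n[(e, o)] += 1
    q[(e, o)] += (r - q[(e, o)]) / n[(e, o)]

# The learned policy adapts its offer to the responder's emotional state.
print({e: max(offers, key=lambda o: q[(e, o)]) for e in emotions})
```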
arXiv:2210.03150 (https://arxiv.org/abs/2210.03150) [cs.LG, cs.AI]
Towards Out-of-Distribution Adversarial Robustness
Authors: Adam Ibrahim, Charles Guille-Escuret, Ioannis Mitliagkas, Irina Rish, David Krueger, Pouya Bashivan
Abstract: Adversarial robustness continues to be a major challenge for deep learning. A core issue is that robustness to one type of attack often fails to transfer to other attacks. While prior work establishes a theoretical trade-off in robustness against different $L_p$ norms, we show that there is potential for improvement against many commonly used attacks by adopting a domain generalisation approach. Concretely, we treat each type of attack as a domain, and apply the Risk Extrapolation method (REx), which promotes similar levels of robustness against all training attacks. Compared to existing methods, we obtain similar or superior worst-case adversarial robustness on attacks seen during training. Moreover, we achieve superior performance on families or tunings of attacks only encountered at test time. On ensembles of attacks, our approach improves the accuracy from 3.4% with the best existing baseline to 25.9% on MNIST, and from 16.9% to 23.5% on CIFAR10.
Submitted 26 June, 2023; v1 submitted 6 October, 2022; originally announced October 2022.
Comments: Version of NeurIPS 2023 submission
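A minimal sketch of the domain-generalisation view described in the abstract: compute the adversarial risk separately for each attack type (treated as a domain) and add a REx-style variance penalty across those risks. The attack set, penalty weight, models, and the step that actually generates the perturbations are placeholders, not the paper's exact training recipe.

```python
import torch

def rex_adversarial_loss(model, batches_by_attack, criterion, beta=10.0):
    """Mean adversarial risk plus a variance penalty across attack "domains".

    `batches_by_attack` maps an attack name to a (perturbed_inputs, labels) pair
    produced by that attack; generating the perturbations is out of scope here.
    """
    risks = []
    for attack_name, (x_adv, y) in batches_by_attack.items():
        risks.append(criterion(model(x_adv), y))
    risks = torch.stack(risks)
    return risks.mean() + beta * risks.var()

# Toy usage with random "adversarial" batches standing in for real attacks (e.g. Linf / L2 PGD).
torch.manual_seed(0)
model = torch.nn.Linear(20, 5)
criterion = torch.nn.CrossEntropyLoss()
batches = {a: (torch.randn(8, 20), torch.randint(0, 5, (8,))) for a in ["pgd_linf", "pgd_l2"]}
loss = rex_adversarial_loss(model, batches, criterion)
loss.backward()
print(loss.item())
```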
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.03150v4-abstract-full').style.display = 'none'; document.getElementById('2210.03150v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Version of NeurIPS 2023 submission</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2207.04543">arXiv:2207.04543</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2207.04543">pdf</a>, <a href="https://arxiv.org/format/2207.04543">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Challenging Common Assumptions about Catastrophic Forgetting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lesort%2C+T">Timoth茅e Lesort</a>, <a href="/search/cs?searchtype=author&amp;query=Ostapenko%2C+O">Oleksiy Ostapenko</a>, <a href="/search/cs?searchtype=author&amp;query=Misra%2C+D">Diganta Misra</a>, <a href="/search/cs?searchtype=author&amp;query=Arefin%2C+M+R">Md Rifat Arefin</a>, <a href="/search/cs?searchtype=author&amp;query=Rodr%C3%ADguez%2C+P">Pau Rodr铆guez</a>, <a href="/search/cs?searchtype=author&amp;query=Charlin%2C+L">Laurent Charlin</a>, <a href="/search/cs?searchtype=author&amp;query=Rish%2C+I">Irina Rish</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2207.04543v2-abstract-short" style="display: inline;"> Building learning agents that can progressively learn and accumulate knowledge is the core goal of the continual learning (CL) research field. Unfortunately, training a model on new data usually compromises the performance on past data. In the CL literature, this effect is referred to as catastrophic forgetting (CF). CF has been largely studied, and a plethora of methods have been proposed to addr&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2207.04543v2-abstract-full').style.display = 'inline'; document.getElementById('2207.04543v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2207.04543v2-abstract-full" style="display: none;"> Building learning agents that can progressively learn and accumulate knowledge is the core goal of the continual learning (CL) research field. Unfortunately, training a model on new data usually compromises the performance on past data. In the CL literature, this effect is referred to as catastrophic forgetting (CF). CF has been largely studied, and a plethora of methods have been proposed to address it on short sequences of non-overlapping tasks. 
In such setups, CF always leads to a quick and significant drop in performance in past tasks. Nevertheless, despite CF, recent work showed that SGD training on linear models accumulates knowledge in a CL regression setup. This phenomenon becomes especially visible when tasks reoccur. We might then wonder if DNNs trained with SGD or any standard gradient-based optimization accumulate knowledge in such a way. Such phenomena would have interesting consequences for applying DNNs to real continual scenarios. Indeed, standard gradient-based optimization methods are significantly less computationally expensive than existing CL algorithms. In this paper, we study the progressive knowledge accumulation (KA) in DNNs trained with gradient-based algorithms in long sequences of tasks with data re-occurrence. We propose a new framework, SCoLe (Scaling Continual Learning), to investigate KA and discover that catastrophic forgetting has a limited effect on DNNs trained with SGD. When trained on long sequences with data sparsely re-occurring, the overall accuracy improves, which might be counter-intuitive given the CF phenomenon. We empirically investigate KA in DNNs under various data occurrence frequencies and propose simple and scalable strategies to increase knowledge accumulation in DNNs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2207.04543v2-abstract-full').style.display = 'none'; document.getElementById('2207.04543v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 July, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2022. 
arXiv:2205.00329 (https://arxiv.org/abs/2205.00329) [cs.LG, cs.AI]
Continual Learning with Foundation Models: An Empirical Study of Latent Replay
Authors: Oleksiy Ostapenko, Timothee Lesort, Pau Rodríguez, Md Rifat Arefin, Arthur Douillard, Irina Rish, Laurent Charlin
Abstract: Rapid development of large-scale pre-training has resulted in foundation models that can act as effective feature extractors on a variety of downstream tasks and domains. Motivated by this, we study the efficacy of pre-trained vision models as a foundation for downstream continual learning (CL) scenarios. Our goal is twofold. First, we want to understand the compute-accuracy trade-off between CL in the raw-data space and in the latent space of pre-trained encoders. Second, we investigate how the characteristics of the encoder, the pre-training algorithm and data, as well as of the resulting latent space affect CL performance. For this, we compare the efficacy of various pre-trained models in large-scale benchmarking scenarios with a vanilla replay setting applied in the latent and in the raw-data space. Notably, this study shows how transfer, forgetting, task similarity and learning are dependent on the input data characteristics and not necessarily on the CL algorithms. First, we show that under some circumstances reasonable CL performance can readily be achieved with a non-parametric classifier at negligible compute. We then show how models pre-trained on broader data result in better performance for various replay sizes. We explain this with representational similarity and transfer properties of these representations. Finally, we show the effectiveness of self-supervised pre-training for downstream domains that are out-of-distribution as compared to the pre-training domain. We point out and validate several research directions that can further increase the efficacy of latent CL including representation ensembling. The diverse set of datasets used in this study can serve as a compute-efficient playground for further CL research. The codebase is available under https://github.com/oleksost/latent_CL.
Submitted 2 July, 2022; v1 submitted 30 April, 2022; originally announced May 2022.
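The "non-parametric classifier at negligible compute" observation can be illustrated with a small sketch: embed each task's data with a frozen pre-trained encoder, accumulate per-class mean latents across tasks, and classify new samples by nearest class mean. The encoder and data below are placeholders (a random linear map stands in for a large pre-trained vision model); this is an assumed illustration of the general pattern, not the paper's benchmark code.

```python
import torch

class NearestClassMean:
    """Continual nearest-class-mean classifier over frozen pre-trained features."""
    def __init__(self):
        self.sums, self.counts = {}, {}

    def update(self, feats, labels):                    # can be called once per task
        for f, y in zip(feats, labels.tolist()):
            self.sums[y] = self.sums.get(y, torch.zeros_like(f)) + f
            self.counts[y] = self.counts.get(y, 0) + 1

    def predict(self, feats):
        classes = sorted(self.sums)
        means = torch.stack([self.sums[c] / self.counts[c] for c in classes])
        dists = torch.cdist(feats, means)               # (N, num_classes)
        return torch.tensor([classes[int(i)] for i in dists.argmin(dim=1)])

# Toy usage: a random frozen "encoder" stands in for a pre-trained foundation model.
torch.manual_seed(0)
encoder = torch.nn.Linear(128, 32).eval()
ncm = NearestClassMean()
for task in range(3):                                   # stream of tasks, no raw-data replay
    x, y = torch.randn(64, 128), torch.randint(task * 2, task * 2 + 2, (64,))
    with torch.no_grad():
        ncm.update(encoder(x), y)
with torch.no_grad():
    print(ncm.predict(encoder(torch.randn(8, 128))))
```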
arXiv:2204.01640 (https://arxiv.org/abs/2204.01640) [cs.LG, cs.CV]
APP: Anytime Progressive Pruning
Authors: Diganta Misra, Bharat Runwal, Tianlong Chen, Zhangyang Wang, Irina Rish
Abstract: With the latest advances in deep learning, there has been a lot of focus on the online learning paradigm due to its relevance in practical settings. Although many methods have been investigated for optimal learning settings in scenarios where the data stream is continuous over time, sparse networks training in such settings have often been overlooked. In this paper, we explore the problem of training a neural network with a target sparsity in a particular case of online learning: the anytime learning at macroscale paradigm (ALMA). We propose a novel way of progressive pruning, referred to as Anytime Progressive Pruning (APP); the proposed approach significantly outperforms the baseline dense and Anytime OSP models across multiple architectures and datasets under short, moderate, and long-sequence training. Our method, for example, shows an improvement in accuracy of approximately 7% and a reduction in the generalization gap by approximately 22%, while being approximately 1/3rd the size of the dense baseline model in few-shot restricted imagenet training. We further observe interesting nonmonotonic transitions in the generalization gap in the high number of megabatches-based ALMA. The code and experiment dashboards can be accessed at https://github.com/landskape-ai/Progressive-Pruning and https://wandb.ai/landskape/APP, respectively.
Submitted 1 June, 2022; v1 submitted 4 April, 2022; originally announced April 2022.
Comments: 21 pages including 4 pages of references. Preprint version

arXiv:2203.09978 (https://arxiv.org/abs/2203.09978) [cs.LG, stat.ML]
WOODS: Benchmarks for Out-of-Distribution Generalization in Time Series
Authors: Jean-Christophe Gagnon-Audet, Kartik Ahuja, Mohammad-Javad Darvishi-Bayazi, Pooneh Mousavi, Guillaume Dumas, Irina Rish
Abstract: Machine learning models often fail to generalize well under distributional shifts. Understanding and overcoming these failures have led to a research field of Out-of-Distribution (OOD) generalization. Despite being extensively studied for static computer vision tasks, OOD generalization has been underexplored for time series tasks. To shine light on this gap, we present WOODS: eight challenging open-source time series benchmarks covering a diverse range of data modalities, such as videos, brain recordings, and sensor signals. We revise the existing OOD generalization algorithms for time series tasks and evaluate them using our systematic framework. Our experiments show a large room for improvement for empirical risk minimization and OOD generalization algorithms on our datasets, thus underscoring the new challenges posed by time series tasks. Code and documentation are available at https://woods-benchmarks.github.io .
Submitted 6 April, 2023; v1 submitted 18 March, 2022; originally announced March 2022.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">47 pages, 21 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2201.13415">arXiv:2201.13415</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2201.13415">pdf</a>, <a href="https://arxiv.org/format/2201.13415">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> </div> </div> <p class="title is-5 mathjax"> Towards Scaling Difference Target Propagation by Learning Backprop Targets </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ernoult%2C+M">Maxence Ernoult</a>, <a href="/search/cs?searchtype=author&amp;query=Normandin%2C+F">Fabrice Normandin</a>, <a href="/search/cs?searchtype=author&amp;query=Moudgil%2C+A">Abhinav Moudgil</a>, <a href="/search/cs?searchtype=author&amp;query=Spinney%2C+S">Sean Spinney</a>, <a href="/search/cs?searchtype=author&amp;query=Belilovsky%2C+E">Eugene Belilovsky</a>, <a href="/search/cs?searchtype=author&amp;query=Rish%2C+I">Irina Rish</a>, <a href="/search/cs?searchtype=author&amp;query=Richards%2C+B">Blake Richards</a>, <a href="/search/cs?searchtype=author&amp;query=Bengio%2C+Y">Yoshua Bengio</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2201.13415v1-abstract-short" style="display: inline;"> The development of biologically-plausible learning algorithms is important for understanding learning in the brain, but most of them fail to scale-up to real-world tasks, limiting their potential as explanations for learning by real brains. As such, it is important to explore learning algorithms that come with strong theoretical guarantees and can match the performance of backpropagation (BP) on c&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2201.13415v1-abstract-full').style.display = 'inline'; document.getElementById('2201.13415v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2201.13415v1-abstract-full" style="display: none;"> The development of biologically-plausible learning algorithms is important for understanding learning in the brain, but most of them fail to scale-up to real-world tasks, limiting their potential as explanations for learning by real brains. As such, it is important to explore learning algorithms that come with strong theoretical guarantees and can match the performance of backpropagation (BP) on complex tasks. One such algorithm is Difference Target Propagation (DTP), a biologically-plausible learning algorithm whose close relation with Gauss-Newton (GN) optimization has been recently established. However, the conditions under which this connection rigorously holds preclude layer-wise training of the feedback pathway synaptic weights (which is more biologically plausible). Moreover, good alignment between DTP weight updates and loss gradients is only loosely guaranteed and under very specific conditions for the architecture being trained. 
In this paper, we propose a novel feedback weight training scheme that ensures both that DTP approximates BP and that layer-wise feedback weight training can be restored without sacrificing any theoretical guarantees. Our theory is corroborated by experimental results and we report the best performance ever achieved by DTP on CIFAR-10 and ImageNet 32$\times$32.
Submitted 31 January, 2022; originally announced January 2022.

arXiv:2201.11986 (https://arxiv.org/abs/2201.11986) [cs.LG, cs.AI]
Gradient Masked Averaging for Federated Learning
Authors: Irene Tenison, Sai Aravind Sreeramadas, Vaikkunth Mugunthan, Edouard Oyallon, Irina Rish, Eugene Belilovsky
Abstract: Federated learning (FL) is an emerging paradigm that permits a large number of clients with heterogeneous data to coordinate learning of a unified global model without the need to share data amongst each other. A major challenge in federated learning is the heterogeneity of data across clients, which can degrade the performance of standard FL algorithms. Standard FL algorithms involve averaging of model parameters or gradient updates to approximate the global model at the server.
However, we argue that in heterogeneous settings, averaging can result in information loss and lead to poor generalization due to the bias induced by dominant client gradients. We hypothesize that to generalize better across non-i.i.d datasets, the algorithms should focus on learning the invariant mechanism that is constant while ignoring spurious mechanisms that differ across clients. Inspired from recent works in Out-of-Distribution generalization, we propose a gradient masked averaging approach for FL as an alternative to the standard averaging of client updates. This aggregation technique for client updates can be adapted as a drop-in replacement in most existing federated algorithms. We perform extensive experiments on multiple FL algorithms with in-distribution, real-world, feature-skewed out-of-distribution, and quantity imbalanced datasets and show that it provides consistent improvements, particularly in the case of heterogeneous clients.
Submitted 14 November, 2023; v1 submitted 28 January, 2022; originally announced January 2022.
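One plausible instantiation of the "masked averaging" idea, kept deliberately simple: average client updates coordinate-wise, but keep only coordinates on which a sufficient fraction of clients agree in sign. The hard mask and the threshold value are illustrative assumptions; the paper's actual masking rule may differ.

```python
import torch

def gradient_masked_average(client_updates, threshold=0.8):
    """Average flattened client updates, masking coordinates with low sign agreement.

    `client_updates`: list of 1-D tensors, one per client. The hard mask with
    `threshold` is an illustrative choice, not necessarily the paper's rule.
    """
    updates = torch.stack(client_updates)               # (num_clients, num_params)
    agreement = torch.sign(updates).mean(dim=0).abs()   # 1.0 = all clients agree on the sign
    mask = (agreement >= threshold).float()
    return mask * updates.mean(dim=0)

# Toy usage: 5 clients, 4 parameters; the last coordinate has conflicting signs.
ups = [torch.tensor([0.1, -0.2, 0.05, (-1) ** k * 0.3]) for k in range(5)]
print(gradient_masked_average(ups))                     # the conflicting coordinate is zeroed
```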
arXiv:2112.07066 (https://arxiv.org/abs/2112.07066) [cs.LG]
Continual Learning In Environments With Polynomial Mixing Times
Authors: Matthew Riemer, Sharath Chandra Raparthy, Ignacio Cases, Gopeshh Subbaraj, Maximilian Puelma Touzel, Irina Rish
Abstract: The mixing time of the Markov chain induced by a policy limits performance in real-world continual learning scenarios. Yet, the effect of mixing times on learning in continual reinforcement learning (RL) remains underexplored. In this paper, we characterize problems that are of long-term interest to the development of continual RL, which we call scalable MDPs, through the lens of mixing times. In particular, we theoretically establish that scalable MDPs have mixing times that scale polynomially with the size of the problem. We go on to demonstrate that polynomial mixing times present significant difficulties for existing approaches, which suffer from myopic bias and stale bootstrapped estimates. To validate our theory, we study the empirical scaling behavior of mixing times with respect to the number of tasks and task duration for high performing policies deployed across multiple Atari games. Our analysis demonstrates both that polynomial mixing times do emerge in practice and how their existence may lead to unstable learning behavior like catastrophic forgetting in continual learning settings.
Submitted 13 October, 2022; v1 submitted 13 December, 2021; originally announced December 2021.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at NeurIPS 2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2110.09419">arXiv:2110.09419</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2110.09419">pdf</a>, <a href="https://arxiv.org/format/2110.09419">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Compositional Attention: Disentangling Search and Retrieval </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Mittal%2C+S">Sarthak Mittal</a>, <a href="/search/cs?searchtype=author&amp;query=Raparthy%2C+S+C">Sharath Chandra Raparthy</a>, <a href="/search/cs?searchtype=author&amp;query=Rish%2C+I">Irina Rish</a>, <a href="/search/cs?searchtype=author&amp;query=Bengio%2C+Y">Yoshua Bengio</a>, <a href="/search/cs?searchtype=author&amp;query=Lajoie%2C+G">Guillaume Lajoie</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2110.09419v2-abstract-short" style="display: inline;"> Multi-head, key-value attention is the backbone of the widely successful Transformer model and its variants. This attention mechanism uses multiple parallel key-value attention blocks (called heads), each performing two fundamental computations: (1) search - selection of a relevant entity from a set via query-key interactions, and (2) retrieval - extraction of relevant features from the selected e&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.09419v2-abstract-full').style.display = 'inline'; document.getElementById('2110.09419v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2110.09419v2-abstract-full" style="display: none;"> Multi-head, key-value attention is the backbone of the widely successful Transformer model and its variants. This attention mechanism uses multiple parallel key-value attention blocks (called heads), each performing two fundamental computations: (1) search - selection of a relevant entity from a set via query-key interactions, and (2) retrieval - extraction of relevant features from the selected entity via a value matrix. Importantly, standard attention heads learn a rigid mapping between search and retrieval. In this work, we first highlight how this static nature of the pairing can potentially: (a) lead to learning of redundant parameters in certain tasks, and (b) hinder generalization. To alleviate this problem, we propose a novel attention mechanism, called Compositional Attention, that replaces the standard head structure. The proposed mechanism disentangles search and retrieval and composes them in a dynamic, flexible and context-dependent manner through an additional soft competition stage between the query-key combination and value pairing. Through a series of numerical experiments, we show that it outperforms standard multi-head attention on a variety of tasks, including some out-of-distribution settings. 
Through our qualitative analysis, we demonstrate that Compositional Attention leads to dynamic specialization based on the type of retrieval needed. Our proposed mechanism generalizes multi-head attention, allows independent scaling of search and retrieval, and can easily be implemented in lieu of standard attention heads in any network architecture.
Submitted 13 February, 2022; v1 submitted 18 October, 2021; originally announced October 2021.

arXiv:2110.06990 (https://arxiv.org/abs/2110.06990) [cs.LG, cs.AI, cs.CV]
Scaling Laws for the Few-Shot Adaptation of Pre-trained Image Classifiers
Authors: Gabriele Prato, Simon Guiroy, Ethan Caballero, Irina Rish, Sarath Chandar
Abstract: Empirical science of neural scaling laws is a rapidly growing area of significant importance to the future of machine learning, particularly in the light of recent breakthroughs achieved by large-scale pre-trained models such as GPT-3, CLIP and DALL-e.
Accurately predicting the neural network performance with increasing resources such as data, compute and model size provides a more comprehensive evaluation of different approaches across multiple scales, as opposed to traditional point-wise comparisons of fixed-size models on fixed-size benchmarks, and, most importantly, allows for focus on the best-scaling, and thus most promising in the future, approaches. In this work, we consider a challenging problem of few-shot learning in image classification, especially when the target data distribution in the few-shot phase is different from the source, training, data distribution, in the sense that it includes new image classes not encountered during training. Our current main goal is to investigate how the amount of pre-training data affects the few-shot generalization performance of standard image classifiers. Our key observations are that (1) such performance improvements are well-approximated by power laws (linear log-log plots) as the training set size increases, (2) this applies to both cases of target data coming from either the same or from a different domain (i.e., new classes) as the training data, and (3) few-shot performance on new classes converges at a faster rate than the standard classification performance on previously seen classes. Our findings shed new light on the relationship between scale and generalization. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.06990v2-abstract-full').style.display = 'none'; document.getElementById('2110.06990v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 October, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 October, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2021. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2108.12461">arXiv:2108.12461</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2108.12461">pdf</a>, <a href="https://arxiv.org/format/2108.12461">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Optimization and Control">math.OC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Statistics Theory">math.ST</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Approximate Bayesian Optimisation for Neural Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hassen%2C+N">Nadhir Hassen</a>, <a href="/search/cs?searchtype=author&amp;query=Rish%2C+I">Irina Rish</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2108.12461v2-abstract-short" style="display: inline;"> A body of work has been done to automate machine learning algorithms, highlighting the importance of model choice. Automating the process of choosing the best forecasting model and its corresponding parameters can improve a wide range of real-world applications. 
Bayesian optimisation (BO) uses blackbox optimisation methods to propose solutions according to an exploration-exploitation tra&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2108.12461v2-abstract-full').style.display = 'inline'; document.getElementById('2108.12461v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2108.12461v2-abstract-full" style="display: none;"> A body of work has been done to automate machine learning algorithms, highlighting the importance of model choice. Automating the process of choosing the best forecasting model and its corresponding parameters can improve a wide range of real-world applications. Bayesian optimisation (BO) uses blackbox optimisation methods to propose solutions according to an exploration-exploitation trade-off criterion through acquisition functions. The BO framework requires two key ingredients: a probabilistic surrogate model that encodes a prior belief over the unknown (data-dependent) objective function, and an objective function that describes how optimal the model fit is. Choosing the best model and its associated hyperparameters can be very expensive; the surrogate is typically fit using Gaussian processes (GPs), with approximate inference applied to some extent because of its intractability. However, since GPs scale cubically with the number of observations, it has been challenging to handle objectives whose optimisation requires many evaluations. In addition, most real datasets are non-stationary, which conflicts with the idealistic assumptions underlying surrogate models. Addressing analytical tractability and computational feasibility in a stochastic fashion is necessary to ensure the efficiency and applicability of Bayesian optimisation. In this paper we explore the use of neural networks as an alternative to GPs for modelling distributions over functions; we provide a link between density-ratio estimation and class probability estimation based on approximate inference, and this reformulation provides algorithmic efficiency and tractability. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2108.12461v2-abstract-full').style.display = 'none'; document.getElementById('2108.12461v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 August, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 August, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2021. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages with 4 pages supplementary materials</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2108.01005">arXiv:2108.01005</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2108.01005">pdf</a>, <a href="https://arxiv.org/format/2108.01005">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Sequoia: A Software Framework to Unify Continual Learning Research </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Normandin%2C+F">Fabrice Normandin</a>, <a href="/search/cs?searchtype=author&amp;query=Golemo%2C+F">Florian Golemo</a>, <a href="/search/cs?searchtype=author&amp;query=Ostapenko%2C+O">Oleksiy Ostapenko</a>, <a href="/search/cs?searchtype=author&amp;query=Rodriguez%2C+P">Pau Rodriguez</a>, <a href="/search/cs?searchtype=author&amp;query=Riemer%2C+M+D">Matthew D Riemer</a>, <a href="/search/cs?searchtype=author&amp;query=Hurtado%2C+J">Julio Hurtado</a>, <a href="/search/cs?searchtype=author&amp;query=Khetarpal%2C+K">Khimya Khetarpal</a>, <a href="/search/cs?searchtype=author&amp;query=Lindeborg%2C+R">Ryan Lindeborg</a>, <a href="/search/cs?searchtype=author&amp;query=Cecchi%2C+L">Lucas Cecchi</a>, <a href="/search/cs?searchtype=author&amp;query=Lesort%2C+T">Timoth茅e Lesort</a>, <a href="/search/cs?searchtype=author&amp;query=Charlin%2C+L">Laurent Charlin</a>, <a href="/search/cs?searchtype=author&amp;query=Rish%2C+I">Irina Rish</a>, <a href="/search/cs?searchtype=author&amp;query=Caccia%2C+M">Massimo Caccia</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2108.01005v4-abstract-short" style="display: inline;"> The field of Continual Learning (CL) seeks to develop algorithms that accumulate knowledge and skills over time through interaction with non-stationary environments. In practice, a plethora of evaluation procedures (settings) and algorithmic solutions (methods) exist, each with their own potentially disjoint set of assumptions. This variety makes measuring progress in CL difficult. We propose a ta&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2108.01005v4-abstract-full').style.display = 'inline'; document.getElementById('2108.01005v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2108.01005v4-abstract-full" style="display: none;"> The field of Continual Learning (CL) seeks to develop algorithms that accumulate knowledge and skills over time through interaction with non-stationary environments. In practice, a plethora of evaluation procedures (settings) and algorithmic solutions (methods) exist, each with their own potentially disjoint set of assumptions. This variety makes measuring progress in CL difficult. We propose a taxonomy of settings, where each setting is described as a set of assumptions. A tree-shaped hierarchy emerges from this view, where more general settings become the parents of those with more restrictive assumptions. 
This makes it possible to use inheritance to share and reuse research, as developing a method for a given setting also makes it directly applicable to any of its children. We instantiate this idea as a publicly available software framework called Sequoia, which features a wide variety of settings from both the Continual Supervised Learning (CSL) and Continual Reinforcement Learning (CRL) domains. Sequoia also includes a growing suite of methods which are easy to extend and customize, in addition to more specialized methods from external libraries. We hope that this new paradigm and its first implementation can help unify and accelerate research in CL. You can help us grow the tree by visiting www.github.com/lebrice/Sequoia. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2108.01005v4-abstract-full').style.display = 'none'; document.getElementById('2108.01005v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 August, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2021. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2107.09539">arXiv:2107.09539</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2107.09539">pdf</a>, <a href="https://arxiv.org/format/2107.09539">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Parametric Scattering Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gauthier%2C+S">Shanel Gauthier</a>, <a href="/search/cs?searchtype=author&amp;query=Th%C3%A9rien%2C+B">Benjamin Thérien</a>, <a href="/search/cs?searchtype=author&amp;query=Als%C3%A8ne-Racicot%2C+L">Laurent Alsène-Racicot</a>, <a href="/search/cs?searchtype=author&amp;query=Chaudhary%2C+M">Muawiz Chaudhary</a>, <a href="/search/cs?searchtype=author&amp;query=Rish%2C+I">Irina Rish</a>, <a href="/search/cs?searchtype=author&amp;query=Belilovsky%2C+E">Eugene Belilovsky</a>, <a href="/search/cs?searchtype=author&amp;query=Eickenberg%2C+M">Michael Eickenberg</a>, <a href="/search/cs?searchtype=author&amp;query=Wolf%2C+G">Guy Wolf</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2107.09539v4-abstract-short" style="display: inline;"> The wavelet scattering transform creates geometric invariants and deformation stability. In multiple signal domains, it has been shown to yield more discriminative representations compared to other non-learned representations and to outperform learned representations in certain tasks, particularly on limited labeled data and highly structured signals. 
The wavelet filters used in the scattering tra&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2107.09539v4-abstract-full').style.display = 'inline'; document.getElementById('2107.09539v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2107.09539v4-abstract-full" style="display: none;"> The wavelet scattering transform creates geometric invariants and deformation stability. In multiple signal domains, it has been shown to yield more discriminative representations compared to other non-learned representations and to outperform learned representations in certain tasks, particularly on limited labeled data and highly structured signals. The wavelet filters used in the scattering transform are typically selected to create a tight frame via a parameterized mother wavelet. In this work, we investigate whether this standard wavelet filterbank construction is optimal. Focusing on Morlet wavelets, we propose to learn the scales, orientations, and aspect ratios of the filters to produce problem-specific parameterizations of the scattering transform. We show that our learned versions of the scattering transform yield significant performance gains in small-sample classification settings over the standard scattering transform. Moreover, our empirical results suggest that traditional filterbank constructions may not always be necessary for scattering transforms to extract effective representations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2107.09539v4-abstract-full').style.display = 'none'; document.getElementById('2107.09539v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 August, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 July, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2021. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> F.2.2; I.2.7 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2106.06607">arXiv:2106.06607</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2106.06607">pdf</a>, <a href="https://arxiv.org/format/2106.06607">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Invariance Principle Meets Information Bottleneck for Out-of-Distribution Generalization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ahuja%2C+K">Kartik Ahuja</a>, <a href="/search/cs?searchtype=author&amp;query=Caballero%2C+E">Ethan Caballero</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+D">Dinghuai Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Gagnon-Audet%2C+J">Jean-Christophe Gagnon-Audet</a>, <a href="/search/cs?searchtype=author&amp;query=Bengio%2C+Y">Yoshua Bengio</a>, <a href="/search/cs?searchtype=author&amp;query=Mitliagkas%2C+I">Ioannis Mitliagkas</a>, <a href="/search/cs?searchtype=author&amp;query=Rish%2C+I">Irina Rish</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2106.06607v2-abstract-short" style="display: inline;"> The invariance principle from causality is at the heart of notable approaches such as invariant risk minimization (IRM) that seek to address out-of-distribution (OOD) generalization failures. Despite the promising theory, invariance principle-based approaches fail in common classification tasks, where invariant (causal) features capture all the information about the label. Are these failures due t&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2106.06607v2-abstract-full').style.display = 'inline'; document.getElementById('2106.06607v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2106.06607v2-abstract-full" style="display: none;"> The invariance principle from causality is at the heart of notable approaches such as invariant risk minimization (IRM) that seek to address out-of-distribution (OOD) generalization failures. Despite the promising theory, invariance principle-based approaches fail in common classification tasks, where invariant (causal) features capture all the information about the label. Are these failures due to the methods failing to capture the invariance? Or is the invariance principle itself insufficient? To answer these questions, we revisit the fundamental assumptions in linear regression tasks, where invariance-based approaches were shown to provably generalize OOD. In contrast to the linear regression tasks, we show that for linear classification tasks we need much stronger restrictions on the distribution shifts, or otherwise OOD generalization is impossible. Furthermore, even with appropriate restrictions on distribution shifts in place, we show that the invariance principle alone is insufficient. 
We prove that a form of the information bottleneck constraint along with invariance helps address key failures when invariant features capture all the information about the label and also retains the existing success when they do not. We propose an approach that incorporates both of these principles and demonstrate its effectiveness in several experiments. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2106.06607v2-abstract-full').style.display = 'none'; document.getElementById('2106.06607v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 11 June, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2021. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2106.02266">arXiv:2106.02266</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2106.02266">pdf</a>, <a href="https://arxiv.org/format/2106.02266">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> SAND-mask: An Enhanced Gradient Masking Strategy for the Discovery of Invariances in Domain Generalization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Shahtalebi%2C+S">Soroosh Shahtalebi</a>, <a href="/search/cs?searchtype=author&amp;query=Gagnon-Audet%2C+J">Jean-Christophe Gagnon-Audet</a>, <a href="/search/cs?searchtype=author&amp;query=Laleh%2C+T">Touraj Laleh</a>, <a href="/search/cs?searchtype=author&amp;query=Faramarzi%2C+M">Mojtaba Faramarzi</a>, <a href="/search/cs?searchtype=author&amp;query=Ahuja%2C+K">Kartik Ahuja</a>, <a href="/search/cs?searchtype=author&amp;query=Rish%2C+I">Irina Rish</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2106.02266v2-abstract-short" style="display: inline;"> A major bottleneck in the real-world applications of machine learning models is their failure in generalizing to unseen domains whose data distribution is not i.i.d to the training domains. This failure often stems from learning non-generalizable features in the training domains that are spuriously correlated with the label of data. To address this shortcoming, there has been a growing surge of in&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2106.02266v2-abstract-full').style.display = 'inline'; document.getElementById('2106.02266v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2106.02266v2-abstract-full" style="display: none;"> A major bottleneck in the real-world applications of machine learning models is their failure in generalizing to unseen domains whose data distribution is not i.i.d to the training domains. This failure often stems from learning non-generalizable features in the training domains that are spuriously correlated with the label of data. 
To address this shortcoming, there has been a growing surge of interest in learning good explanations that are hard to vary, which is studied under the notion of Out-of-Distribution (OOD) Generalization. The search for good explanations that are \textit{invariant} across different domains can be seen as finding local (global) minima in the loss landscape that hold true across all of the training domains. In this paper, we propose a masking strategy, which determines a continuous weight based on the agreement of gradients that flow in each edge of the network, in order to control the amount of update received by the edge in each step of optimization. Particularly, our proposed technique, referred to as &#34;Smoothed-AND (SAND)-masking&#34;, not only validates the agreement in the direction of gradients but also promotes the agreement among their magnitudes to further ensure the discovery of invariances across training domains. SAND-mask is validated over the Domainbed benchmark for domain generalization and significantly improves the state-of-the-art accuracy on the Colored MNIST dataset while providing competitive results on other domain generalization datasets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2106.02266v2-abstract-full').style.display = 'none'; document.getElementById('2106.02266v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 September, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 June, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2021. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2106.01834">arXiv:2106.01834</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2106.01834">pdf</a>, <a href="https://arxiv.org/format/2106.01834">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Continual Learning in Deep Networks: an Analysis of the Last Layer </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lesort%2C+T">Timothée Lesort</a>, <a href="/search/cs?searchtype=author&amp;query=George%2C+T">Thomas George</a>, <a href="/search/cs?searchtype=author&amp;query=Rish%2C+I">Irina Rish</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2106.01834v3-abstract-short" style="display: inline;"> We study how different output layer parameterizations of a deep neural network affects learning and forgetting in continual learning settings. The following three effects can cause catastrophic forgetting in the output layer: (1) weights modifications, (2) interference, and (3) projection drift. 
In this paper, our goal is to provide more insights into how changing the output layer parameterization&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2106.01834v3-abstract-full').style.display = 'inline'; document.getElementById('2106.01834v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2106.01834v3-abstract-full" style="display: none;"> We study how different output layer parameterizations of a deep neural network affects learning and forgetting in continual learning settings. The following three effects can cause catastrophic forgetting in the output layer: (1) weights modifications, (2) interference, and (3) projection drift. In this paper, our goal is to provide more insights into how changing the output layer parameterization may address (1) and (2). Some potential solutions to those issues are proposed and evaluated here in several continual learning scenarios. We show that the best-performing type of output layer depends on the data distribution drifts and/or the amount of data available. In particular, in some cases where a standard linear layer would fail, changing parameterization is sufficient to achieve a significantly better performance, without introducing any continual-learning algorithm but instead by using standard SGD to train a model. Our analysis and results shed light on the dynamics of the output layer in continual learning scenarios and suggest a way of selecting the best type of output layer for a given scenario. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2106.01834v3-abstract-full').style.display = 'none'; document.getElementById('2106.01834v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 August, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 June, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2021. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2104.10322">arXiv:2104.10322</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2104.10322">pdf</a>, <a href="https://arxiv.org/format/2104.10322">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Gradient Masked Federated Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Tenison%2C+I">Irene Tenison</a>, <a href="/search/cs?searchtype=author&amp;query=Francis%2C+S">Sreya Francis</a>, <a href="/search/cs?searchtype=author&amp;query=Rish%2C+I">Irina Rish</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2104.10322v1-abstract-short" style="display: inline;"> Federated Averaging (FedAVG) has become the most popular federated learning algorithm due to its simplicity and low communication overhead. We use simple examples to show that FedAVG has the tendency to sew together the optima across the participating clients. 
These sewed optima exhibit poor generalization when used on a new client with new data distribution. Inspired by the invariance principles&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2104.10322v1-abstract-full').style.display = 'inline'; document.getElementById('2104.10322v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2104.10322v1-abstract-full" style="display: none;"> Federated Averaging (FedAVG) has become the most popular federated learning algorithm due to its simplicity and low communication overhead. We use simple examples to show that FedAVG has the tendency to sew together the optima across the participating clients. These sewed optima exhibit poor generalization when used on a new client with new data distribution. Inspired by the invariance principles in (Arjovsky et al., 2019; Parascandolo et al., 2020), we focus on learning a model that is locally optimal across the different clients simultaneously. We propose a modification to FedAVG algorithm to include masked gradients (AND-mask from (Parascandolo et al., 2020)) across the clients and uses them to carry out an additional server model update. We show that this algorithm achieves better accuracy (out-of-distribution) than FedAVG, especially when the data is non-identically distributed across clients. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2104.10322v1-abstract-full').style.display = 'none'; document.getElementById('2104.10322v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 April, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2021. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.0 </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> ICLR 2021 Distributed and Private Machine Learning(DPML) Workshop </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2104.06557">arXiv:2104.06557</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2104.06557">pdf</a>, <a href="https://arxiv.org/format/2104.06557">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Towards Causal Federated Learning For Enhanced Robustness and Privacy </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Francis%2C+S">Sreya Francis</a>, <a href="/search/cs?searchtype=author&amp;query=Tenison%2C+I">Irene Tenison</a>, <a href="/search/cs?searchtype=author&amp;query=Rish%2C+I">Irina Rish</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2104.06557v1-abstract-short" style="display: inline;"> Federated Learning is an emerging privacy-preserving distributed machine learning approach to building a shared model by performing distributed training locally on participating devices (clients) and aggregating the local models into a global one. As this approach prevents data collection and aggregation, it helps in reducing associated privacy risks to a great extent. However, the data samples ac&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2104.06557v1-abstract-full').style.display = 'inline'; document.getElementById('2104.06557v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2104.06557v1-abstract-full" style="display: none;"> Federated Learning is an emerging privacy-preserving distributed machine learning approach to building a shared model by performing distributed training locally on participating devices (clients) and aggregating the local models into a global one. As this approach prevents data collection and aggregation, it helps in reducing associated privacy risks to a great extent. However, the data samples across all participating clients are usually not independent and identically distributed (non-iid), and Out of Distribution(OOD) generalization for the learned models can be poor. Besides this challenge, federated learning also remains vulnerable to various attacks on security wherein a few malicious participating entities work towards inserting backdoors, degrading the generated aggregated model as well as inferring the data owned by participating entities. In this paper, we propose an approach for learning invariant (causal) features common to all participating clients in a federated learning setup and analyze empirically how it enhances the Out of Distribution (OOD) accuracy as well as the privacy of the final learned model. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2104.06557v1-abstract-full').style.display = 'none'; document.getElementById('2104.06557v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 April, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.0 </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> ICLR 2021 Distributed and Private Machine Learning(DPML) Workshop </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2104.01678">arXiv:2104.01678</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2104.01678">pdf</a>, <a href="https://arxiv.org/format/2104.01678">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Understanding Continual Learning Settings with Data Distribution Drift Analysis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lesort%2C+T">Timoth茅e Lesort</a>, <a href="/search/cs?searchtype=author&amp;query=Caccia%2C+M">Massimo Caccia</a>, <a href="/search/cs?searchtype=author&amp;query=Rish%2C+I">Irina Rish</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2104.01678v2-abstract-short" style="display: inline;"> Classical machine learning algorithms often assume that the data are drawn i.i.d. from a stationary probability distribution. Recently, continual learning emerged as a rapidly growing area of machine learning where this assumption is relaxed, i.e. where the data distribution is non-stationary and changes over time. This paper represents the state of data distribution by a context variable $c$. A d&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2104.01678v2-abstract-full').style.display = 'inline'; document.getElementById('2104.01678v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2104.01678v2-abstract-full" style="display: none;"> Classical machine learning algorithms often assume that the data are drawn i.i.d. from a stationary probability distribution. Recently, continual learning emerged as a rapidly growing area of machine learning where this assumption is relaxed, i.e. where the data distribution is non-stationary and changes over time. This paper represents the state of data distribution by a context variable $c$. A drift in $c$ leads to a data distribution drift. A context drift may change the target distribution, the input distribution, or both. Moreover, distribution drifts might be abrupt or gradual. In continual learning, context drifts may interfere with the learning process and erase previously learned knowledge; thus, continual learning algorithms must include specialized mechanisms to deal with such drifts. 
In this paper, we aim to identify and categorize different types of context drifts and potential assumptions about them, to better characterize various continual-learning scenarios. Moreover, we propose to use the distribution drift framework to provide more precise definitions of several terms commonly used in the continual learning field. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2104.01678v2-abstract-full').style.display = 'none'; document.getElementById('2104.01678v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 July, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 April, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2021. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2012.13490">arXiv:2012.13490</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2012.13490">pdf</a>, <a href="https://arxiv.org/format/2012.13490">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Towards Continual Reinforcement Learning: A Review and Perspectives </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Khetarpal%2C+K">Khimya Khetarpal</a>, <a href="/search/cs?searchtype=author&amp;query=Riemer%2C+M">Matthew Riemer</a>, <a href="/search/cs?searchtype=author&amp;query=Rish%2C+I">Irina Rish</a>, <a href="/search/cs?searchtype=author&amp;query=Precup%2C+D">Doina Precup</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2012.13490v2-abstract-short" style="display: inline;"> In this article, we aim to provide a literature review of different formulations and approaches to continual reinforcement learning (RL), also known as lifelong or non-stationary RL. We begin by discussing our perspective on why RL is a natural fit for studying continual learning. We then provide a taxonomy of different continual RL formulations by mathematically characterizing two key properties&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2012.13490v2-abstract-full').style.display = 'inline'; document.getElementById('2012.13490v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2012.13490v2-abstract-full" style="display: none;"> In this article, we aim to provide a literature review of different formulations and approaches to continual reinforcement learning (RL), also known as lifelong or non-stationary RL. We begin by discussing our perspective on why RL is a natural fit for studying continual learning. We then provide a taxonomy of different continual RL formulations by mathematically characterizing two key properties of non-stationarity, namely, the scope and driver non-stationarity. This offers a unified view of various formulations. Next, we review and present a taxonomy of continual RL approaches. 
We go on to discuss evaluation of continual RL agents, providing an overview of benchmarks used in the literature and important metrics for understanding agent performance. Finally, we highlight open problems and challenges in bridging the gap between the current state of continual RL and findings in neuroscience. While still in its early days, the study of continual RL has the promise to develop better incremental reinforcement learners that can function in increasingly realistic applications where non-stationarity plays a vital role. These include applications such as those in the fields of healthcare, education, logistics, and robotics. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2012.13490v2-abstract-full').style.display = 'none'; document.getElementById('2012.13490v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 November, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 December, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Journal of Artificial Intelligence Research (JAIR)</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Rish%2C+I&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Rish%2C+I&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Rish%2C+I&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv 
mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>
