
Search | arXiv e-print repository

<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;15 of 15 results for author: <span class="mathjax">Mirzadeh, I</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Mirzadeh%2C+I">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Mirzadeh, I"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Mirzadeh%2C+I&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Mirzadeh, I"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01798">arXiv:2411.01798</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.01798">pdf</a>, <a href="https://arxiv.org/format/2411.01798">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> SALSA: Soup-based Alignment Learning for Stronger Adaptation in RLHF </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chegini%2C+A">Atoosa Chegini</a>, <a href="/search/cs?searchtype=author&amp;query=Kazemi%2C+H">Hamid Kazemi</a>, <a href="/search/cs?searchtype=author&amp;query=Mirzadeh%2C+I">Iman Mirzadeh</a>, <a href="/search/cs?searchtype=author&amp;query=Yin%2C+D">Dong Yin</a>, <a href="/search/cs?searchtype=author&amp;query=Horton%2C+M">Maxwell Horton</a>, <a href="/search/cs?searchtype=author&amp;query=Nabi%2C+M">Moin Nabi</a>, <a href="/search/cs?searchtype=author&amp;query=Farajtabar%2C+M">Mehrdad Farajtabar</a>, <a href="/search/cs?searchtype=author&amp;query=Alizadeh%2C+K">Keivan Alizadeh</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01798v1-abstract-short" style="display: inline;"> In Large Language Model (LLM) development, Reinforcement Learning from Human Feedback (RLHF) is crucial for aligning models with human values and preferences. RLHF traditionally relies on the Kullback-Leibler (KL) divergence between the current policy and a frozen initial policy as a reference, which is added as a penalty in policy optimization algorithms like Proximal Policy Optimization (PPO). W&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01798v1-abstract-full').style.display = 'inline'; document.getElementById('2411.01798v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01798v1-abstract-full" style="display: none;"> In Large Language Model (LLM) development, Reinforcement Learning from Human Feedback (RLHF) is crucial for aligning models with human values and preferences. RLHF traditionally relies on the Kullback-Leibler (KL) divergence between the current policy and a frozen initial policy as a reference, which is added as a penalty in policy optimization algorithms like Proximal Policy Optimization (PPO). While this constraint prevents models from deviating too far from the initial checkpoint, it limits exploration of the reward landscape, reducing the model&#39;s ability to discover higher-quality solutions. 
As a result, policy optimization is often trapped in a narrow region of the parameter space, leading to suboptimal alignment and performance. This paper presents SALSA (Soup-based Alignment Learning for Stronger Adaptation), a novel approach designed to overcome these limitations by creating a more flexible and better located reference model through weight-space averaging of two independent supervised fine-tuned (SFT) models. This model soup allows for larger deviation in KL divergence and exploring a promising region of the solution space without sacrificing stability. By leveraging this more robust reference model, SALSA fosters better exploration, achieving higher rewards and improving model robustness, out-of-distribution generalization, and performance. We validate the effectiveness of SALSA through extensive experiments on popular open models (Llama2-7B, Mistral-7B, and Gemma-2B) across various benchmarks (MT-Bench, Arena-Hard, UltraFeedback), where it consistently surpasses PPO by fostering deeper exploration and achieving superior alignment in LLMs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01798v1-abstract-full').style.display = 'none'; document.getElementById('2411.01798v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.19456">arXiv:2410.19456</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.19456">pdf</a>, <a href="https://arxiv.org/format/2410.19456">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Computational Bottlenecks of Training Small-scale Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ashkboos%2C+S">Saleh Ashkboos</a>, <a href="/search/cs?searchtype=author&amp;query=Mirzadeh%2C+I">Iman Mirzadeh</a>, <a href="/search/cs?searchtype=author&amp;query=Alizadeh%2C+K">Keivan Alizadeh</a>, <a href="/search/cs?searchtype=author&amp;query=Sekhavat%2C+M+H">Mohammad Hossein Sekhavat</a>, <a href="/search/cs?searchtype=author&amp;query=Nabi%2C+M">Moin Nabi</a>, <a href="/search/cs?searchtype=author&amp;query=Farajtabar%2C+M">Mehrdad Farajtabar</a>, <a href="/search/cs?searchtype=author&amp;query=Faghri%2C+F">Fartash Faghri</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.19456v2-abstract-short" style="display: inline;"> While large language models (LLMs) dominate the AI landscape, Small-scale large Language Models (SLMs) are gaining attention due to cost and efficiency demands from consumers. However, there is limited research on the training behavior and computational requirements of SLMs. 
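To make the mechanism above concrete, here is a minimal sketch (assuming PyTorch-style tensors and a per-token log-prob approximation of the KL; helper names are illustrative, and this is not the authors' implementation):

```python
import torch

def weight_space_soup(sft_a, sft_b):
    """Average two compatible state dicts parameter-by-parameter to
    form the "soup" reference model described above."""
    return {k: 0.5 * (sft_a[k] + sft_b[k]) for k in sft_a}

def kl_penalized_reward(reward, logp_policy, logp_ref, beta=0.1):
    """PPO-style shaped reward: r - beta * KL(policy || reference),
    with the per-token KL approximated by the log-prob difference."""
    kl = (logp_policy - logp_ref).sum(dim=-1)  # (batch,) total KL per sample
    return reward - beta * kl
```

In vanilla RLHF, logp_ref would come from the frozen SFT checkpoint; SALSA's change, as the abstract describes it, is to take it from the souped weights instead.
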
2. arXiv:2410.19456 [pdf, other]  cs.LG
   Computational Bottlenecks of Training Small-scale Large Language Models
   Authors: Saleh Ashkboos, Iman Mirzadeh, Keivan Alizadeh, Mohammad Hossein Sekhavat, Moin Nabi, Mehrdad Farajtabar, Fartash Faghri
   Abstract: While large language models (LLMs) dominate the AI landscape, Small-scale Large Language Models (SLMs) are gaining attention due to cost and efficiency demands from consumers. However, there is limited research on the training behavior and computational requirements of SLMs. In this study, we explore the computational bottlenecks of training SLMs (up to 2B parameters) by examining the effects of various hyperparameters and configurations, including GPU type, batch size, model size, communication protocol, attention type, and the number of GPUs. We assess these factors on popular cloud services using metrics such as loss per dollar and tokens per second. Our findings aim to support the broader adoption and optimization of language model training for low-resource AI research institutes.
   Submitted 1 December, 2024; v1 submitted 25 October, 2024; originally announced October 2024.
   Comments: 8 pages, 4 figures

3. arXiv:2410.10846 [pdf, other]  cs.LG cs.CL
   Duo-LLM: A Framework for Studying Adaptive Computation in Large Language Models
   Authors: Keivan Alizadeh, Iman Mirzadeh, Hooman Shahrokhi, Dmitry Belenko, Frank Sun, Minsik Cho, Mohammad Hossein Sekhavat, Moin Nabi, Mehrdad Farajtabar
   Abstract: Large Language Models (LLMs) typically generate outputs token by token using a fixed compute budget, leading to inefficient resource utilization. To address this shortcoming, recent advancements in mixture of experts (MoE) models, speculative decoding, and early exit strategies leverage the insight that computational demands can vary significantly based on the complexity and nature of the input. However, identifying optimal routing patterns for dynamic execution remains an open challenge, limiting the full potential of these adaptive methods. To address this need, we study adaptive computation in LLMs more systematically. We propose a novel framework that integrates smaller auxiliary modules within each Feed-Forward Network layer of the LLM. This design enables dynamic routing of tokens based on task complexity: tokens can be processed by either the small or big modules at each layer, or even bypass certain layers entirely. This allows us to introduce a novel notion of a token's difficulty, defined by its potential to benefit from additional computational resources. Importantly, by employing oracles to identify optimal patterns of adaptive computations, we gain valuable insights into the internal workings of LLMs and the routing processes in a simplified heterogeneous MoE setup. We show that trained routers operate differently from oracles and often yield suboptimal solutions. Notably, activating a large module in just one layer outperforms models that use large modules across all layers, underscoring the gap between practical implementations of routing in MoE models and theoretical optima for adaptive computation.
   Submitted 1 October, 2024; originally announced October 2024.

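A toy version of the adaptive-FFN layer sketched in that abstract might look like the following (module sizes, hard argmax routing, and all names are illustrative assumptions, not the paper's architecture):

```python
import torch
import torch.nn as nn

class DuoFFN(nn.Module):
    """Each layer holds a small and a big feed-forward module; a router
    sends each token to one of them, or lets it bypass the layer."""
    def __init__(self, d_model, d_small, d_big):
        super().__init__()
        self.small = nn.Sequential(nn.Linear(d_model, d_small), nn.ReLU(),
                                   nn.Linear(d_small, d_model))
        self.big = nn.Sequential(nn.Linear(d_model, d_big), nn.ReLU(),
                                 nn.Linear(d_big, d_model))
        self.router = nn.Linear(d_model, 3)  # 0: skip, 1: small, 2: big

    def forward(self, x):                    # x: (tokens, d_model)
        choice = self.router(x).argmax(-1)   # hard routing, for illustration
        out = x.clone()                      # choice 0: bypass the layer
        out[choice == 1] = x[choice == 1] + self.small(x[choice == 1])
        out[choice == 2] = x[choice == 2] + self.big(x[choice == 2])
        return out
```

An "oracle" in the paper's sense would replace the trained router with the per-token choice that is best in hindsight, which is what the authors compare trained routers against.
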
4. arXiv:2410.05229 [pdf, other]  cs.LG cs.AI
   GSM-Symbolic: Understanding the Limitations of Mathematical Reasoning in Large Language Models
   Authors: Iman Mirzadeh, Keivan Alizadeh, Hooman Shahrokhi, Oncel Tuzel, Samy Bengio, Mehrdad Farajtabar
   Abstract: Recent advancements in Large Language Models (LLMs) have sparked interest in their formal reasoning capabilities, particularly in mathematics. The GSM8K benchmark is widely used to assess the mathematical reasoning of models on grade-school-level questions. While the performance of LLMs on GSM8K has significantly improved in recent years, it remains unclear whether their mathematical reasoning capabilities have genuinely advanced, raising questions about the reliability of the reported metrics. To address these concerns, we conduct a large-scale study on several SOTA open and closed models. To overcome the limitations of existing evaluations, we introduce GSM-Symbolic, an improved benchmark created from symbolic templates that allow for the generation of a diverse set of questions. GSM-Symbolic enables more controllable evaluations, providing key insights and more reliable metrics for measuring the reasoning capabilities of models. Our findings reveal that LLMs exhibit noticeable variance when responding to different instantiations of the same question. Specifically, the performance of all models declines when only the numerical values in the question are altered in the GSM-Symbolic benchmark. Furthermore, we investigate the fragility of mathematical reasoning in these models and show that their performance significantly deteriorates as the number of clauses in a question increases. We hypothesize that this decline is because current LLMs cannot perform genuine logical reasoning; they replicate reasoning steps from their training data. Adding a single clause that seems relevant to the question causes significant performance drops (up to 65%) across all state-of-the-art models, even though the clause doesn't contribute to the reasoning chain needed for the final answer. Overall, our work offers a more nuanced understanding of LLMs' capabilities and limitations in mathematical reasoning.
   Submitted 7 October, 2024; originally announced October 2024.
   Comments: preprint

5. arXiv:2409.12903 [pdf, other]  cs.CL cs.AI cs.LG
   Scaling Smart: Accelerating Large Language Model Pre-training with Small Model Initialization
   Authors: Mohammad Samragh, Iman Mirzadeh, Keivan Alizadeh Vahid, Fartash Faghri, Minsik Cho, Moin Nabi, Devang Naik, Mehrdad Farajtabar
   Abstract: The pre-training phase of language models often begins with randomly initialized parameters. With the current trends in scaling models, training their large number of parameters can be extremely slow and costly. In contrast, small language models are less expensive to train, but they often cannot achieve the accuracy of large models. In this paper, we explore an intriguing idea that connects these two different regimes: can we develop a method to initialize large language models using smaller pre-trained models, and will such initialization bring any benefits in terms of training time and final accuracy? We introduce HyperCloning, a method that can expand the parameters of a pre-trained language model to those of a larger model with increased hidden dimensions. Our method ensures that the larger model retains the functionality of the smaller model. As a result, the larger model already inherits the predictive power and accuracy of the smaller model before training starts. We demonstrate that training such an initialized model results in significant savings in terms of GPU hours required for pre-training large language models.
   Submitted 20 September, 2024; v1 submitted 19 September, 2024; originally announced September 2024.

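One way a doubled-width linear layer can exactly preserve the small layer's function, in the spirit of the expansion described above (the tiling scheme here is a generic function-preserving construction; the paper's exact scheme may differ):

```python
import numpy as np

def clone_linear(W):
    """Expand a (d_out, d_in) weight to (2*d_out, 2*d_in) so that when
    the input features are duplicated, the outputs duplicate too."""
    half = 0.5 * W
    return np.block([[half, half],
                     [half, half]])

rng = np.random.default_rng(0)
W = rng.normal(size=(4, 3))
x = rng.normal(size=3)
x2 = np.concatenate([x, x])   # duplicated hidden features
# the expanded layer reproduces the small layer's output, duplicated
assert np.allclose(clone_linear(W) @ x2, np.concatenate([W @ x, W @ x]))
```

Applying such an expansion layer by layer (with duplicated embeddings) gives a wider model whose initial predictions match the small pre-trained model, which is the property the abstract highlights.
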
6. arXiv:2404.14619 [pdf, other]  cs.CL cs.AI cs.LG
   OpenELM: An Efficient Language Model Family with Open Training and Inference Framework
   Authors: Sachin Mehta, Mohammad Hossein Sekhavat, Qingqing Cao, Maxwell Horton, Yanzi Jin, Chenfan Sun, Iman Mirzadeh, Mahyar Najibi, Dmitry Belenko, Peter Zatloukal, Mohammad Rastegari
   Abstract: The reproducibility and transparency of large language models are crucial for advancing open research, ensuring the trustworthiness of results, and enabling investigations into data and model biases, as well as potential risks. To this end, we release OpenELM, a state-of-the-art open language model. OpenELM uses a layer-wise scaling strategy to efficiently allocate parameters within each layer of the transformer model, leading to enhanced accuracy. For example, with a parameter budget of approximately one billion parameters, OpenELM exhibits a 2.36% improvement in accuracy compared to OLMo while requiring 2x fewer pre-training tokens. Diverging from prior practices that only provide model weights and inference code, and pre-train on private datasets, our release includes the complete framework for training and evaluation of the language model on publicly available datasets, including training logs, multiple checkpoints, and pre-training configurations. We also release code to convert models to the MLX library for inference and fine-tuning on Apple devices. This comprehensive release aims to empower and strengthen the open research community, paving the way for future open research endeavors. Our source code along with pre-trained model weights and training recipes is available at https://github.com/apple/corenet. Additionally, OpenELM models can be found on HuggingFace at https://huggingface.co/apple/OpenELM.
   Submitted 1 May, 2024; v1 submitted 22 April, 2024; originally announced April 2024.
   Comments: Minor corrections

7. arXiv:2312.11514 [pdf, other]  cs.CL cs.AI cs.LG
   LLM in a flash: Efficient Large Language Model Inference with Limited Memory
   Authors: Keivan Alizadeh, Iman Mirzadeh, Dmitry Belenko, Karen Khatamifard, Minsik Cho, Carlo C Del Mundo, Mohammad Rastegari, Mehrdad Farajtabar
   Abstract: Large language models (LLMs) are central to modern natural language processing, delivering exceptional performance in various tasks. However, their substantial computational and memory requirements present challenges, especially for devices with limited DRAM capacity. This paper tackles the challenge of efficiently running LLMs that exceed the available DRAM capacity by storing the model parameters in flash memory but bringing them on demand to DRAM. Our method involves constructing an inference cost model that takes into account the characteristics of flash memory, guiding us to optimize in two critical areas: reducing the volume of data transferred from flash and reading data in larger, more contiguous chunks. Within this hardware-informed framework, we introduce two principal techniques. First, "windowing" strategically reduces data transfer by reusing previously activated neurons, and second, "row-column bundling", tailored to the sequential data access strengths of flash memory, increases the size of data chunks read from flash memory. These methods collectively enable running models up to twice the size of the available DRAM, with a 4-5x and 20-25x increase in inference speed compared to naive loading approaches on CPU and GPU, respectively. Our integration of sparsity awareness, context-adaptive loading, and a hardware-oriented design paves the way for effective inference of LLMs on devices with limited memory.
   Submitted 30 July, 2024; v1 submitted 12 December, 2023; originally announced December 2023.
   Comments: ACL 2024

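A toy sketch of the "windowing" idea above: neurons that were active for any of the last k tokens stay resident in DRAM, so only the newly needed weights are read from flash (load_from_flash, the window size, and the cache structure are stand-ins, not the paper's implementation):

```python
from collections import deque

class NeuronCache:
    def __init__(self, window=5):
        self.history = deque(maxlen=window)   # active-neuron sets per token
        self.resident = {}                    # neuron id -> weights in DRAM

    def fetch(self, active, load_from_flash):
        new = active - set(self.resident)
        for n in new:                         # only the delta hits flash
            self.resident[n] = load_from_flash(n)
        self.history.append(active)
        keep = set().union(*self.history)     # neurons still in the window
        self.resident = {n: w for n, w in self.resident.items() if n in keep}
        return [self.resident[n] for n in active]
```

Because consecutive tokens tend to activate overlapping neuron sets in ReLU-sparse models, the per-token delta, and hence the flash traffic, stays small.
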
8. arXiv:2310.04564 [pdf, other]  cs.LG cs.AI
   ReLU Strikes Back: Exploiting Activation Sparsity in Large Language Models
   Authors: Iman Mirzadeh, Keivan Alizadeh, Sachin Mehta, Carlo C Del Mundo, Oncel Tuzel, Golnoosh Samei, Mohammad Rastegari, Mehrdad Farajtabar
   Abstract: Large Language Models (LLMs) with billions of parameters have drastically transformed AI applications. However, their demanding computation during inference has raised significant challenges for deployment on resource-constrained devices. Despite recent trends favoring alternative activation functions such as GELU or SiLU, known for increased computation, this study strongly advocates for reinstating ReLU activation in LLMs. We demonstrate that using the ReLU activation function has a negligible impact on convergence and performance while significantly reducing computation and weight transfer. This reduction is particularly valuable during the memory-bound inference step, where efficiency is paramount. Exploring sparsity patterns in ReLU-based LLMs, we unveil the reutilization of activated neurons for generating new tokens, and, leveraging these insights, we propose practical strategies to reduce LLM inference computation by up to three times using ReLU activations, with minimal performance trade-offs.
   Submitted 6 October, 2023; originally announced October 2023.
   Comments: preprint

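To see why ReLU sparsity saves compute in an FFN, as the abstract argues: columns of the down-projection that correspond to zeroed neurons never need to be multiplied (or, on a flash-backed system, even loaded). A minimal numerically equivalent sketch:

```python
import numpy as np

def sparse_ffn(x, W_up, W_down):
    a = np.maximum(W_up @ x, 0.0)   # ReLU zeroes out inactive neurons
    nz = np.flatnonzero(a)          # indices of the activated neurons
    return W_down[:, nz] @ a[nz]    # touch only the active columns

rng = np.random.default_rng(0)
x = rng.normal(size=64)
W_up, W_down = rng.normal(size=(256, 64)), rng.normal(size=(64, 256))
dense = W_down @ np.maximum(W_up @ x, 0.0)
assert np.allclose(sparse_ffn(x, W_up, W_down), dense)  # exact equivalence
```

With trained ReLU LLMs the activation vector is far sparser than with random weights, which is where the reported compute reductions come from.
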
9. arXiv:2202.09826 [pdf, other]  cs.LG cs.AI
   Continual Learning Beyond a Single Model
   Authors: Thang Doan, Seyed Iman Mirzadeh, Mehrdad Farajtabar
   Abstract: A growing body of research in continual learning focuses on the catastrophic forgetting problem. While many attempts have been made to alleviate this problem, the majority of methods assume a single model in the continual learning setup. In this work, we question this assumption and show that employing ensemble models can be a simple yet effective method to improve continual performance. However, ensembles' training and inference costs can increase significantly as the number of models grows. Motivated by this limitation, we study different ensemble models to understand their benefits and drawbacks in continual learning scenarios. Finally, to overcome the high compute cost of ensembles, we leverage recent advances in neural network subspaces to propose a computationally cheap algorithm with a runtime similar to a single model's, yet enjoying the performance benefits of ensembles.
   Submitted 3 July, 2023; v1 submitted 20 February, 2022; originally announced February 2022.
   Comments: Accepted to 2nd Conference on Lifelong Learning Agents (CoLLAs 2023); Keywords: continual learning, neural network subspaces, ensemble models, computationally efficient training

10. arXiv:2202.00275 [pdf, other]  cs.LG cs.AI
   Architecture Matters in Continual Learning
   Authors: Seyed Iman Mirzadeh, Arslan Chaudhry, Dong Yin, Timothy Nguyen, Razvan Pascanu, Dilan Gorur, Mehrdad Farajtabar
   Abstract: A large body of research in continual learning is devoted to overcoming the catastrophic forgetting of neural networks by designing new algorithms that are robust to distribution shifts. However, the majority of these works are strictly focused on the "algorithmic" part of continual learning for a "fixed neural network architecture", and the implications of using different architectures are mostly neglected. Even the few existing continual learning methods that modify the model assume a fixed architecture and aim to develop an algorithm that efficiently uses the model throughout the learning experience. In this work, however, we show that the choice of architecture can significantly impact continual learning performance, and different architectures lead to different trade-offs between the ability to remember previous tasks and the ability to learn new ones. Moreover, we study the impact of various architectural decisions, and our findings yield best practices and recommendations that can improve continual learning performance.
   Submitted 1 February, 2022; originally announced February 2022.
   Comments: preprint

11. arXiv:2110.11526 [pdf, other]  cs.LG cs.AI cs.CV
   Wide Neural Networks Forget Less Catastrophically
   Authors: Seyed Iman Mirzadeh, Arslan Chaudhry, Dong Yin, Huiyi Hu, Razvan Pascanu, Dilan Gorur, Mehrdad Farajtabar
   Abstract: A primary focus area in continual learning research is alleviating the "catastrophic forgetting" problem in neural networks by designing new algorithms that are more robust to distribution shifts. While the recent progress in the continual learning literature is encouraging, our understanding of what properties of neural networks contribute to catastrophic forgetting is still limited. To address this, instead of focusing on continual learning algorithms, in this work we focus on the model itself and study the impact of the "width" of the neural network architecture on catastrophic forgetting, and show that width has a surprisingly significant effect on forgetting. To explain this effect, we study the learning dynamics of the network from various perspectives, such as gradient orthogonality, sparsity, and the lazy training regime. We provide potential explanations that are consistent with the empirical results across different architectures and continual learning benchmarks.
   Submitted 14 July, 2022; v1 submitted 21 October, 2021; originally announced October 2021.
   Comments: ICML 2022

arXiv:2107.00693 (https://arxiv.org/abs/2107.00693) [pdf, other]
Subjects: eess.SP (Signal Processing); cs.LG (Machine Learning)
Inter-Beat Interval Estimation with Tiramisu Model: A Novel Approach with Reduced Error
Authors: Asiful Arefeen, Ali Akbari, Seyed Iman Mirzadeh, Roozbeh Jafari, Behrooz A. Shirazi, Hassan Ghasemzadeh
Abstract: Inter-beat interval (IBI) measurement enables estimation of heart-rate variability (HRV), which, in turn, can provide an early indication of potential cardiovascular diseases. However, extracting IBIs from noisy signals is challenging since the morphology of the signal is distorted in the presence of noise. The electrocardiogram (ECG) of a person in heavy motion is highly corrupted by noise, known as motion artifact, and IBIs extracted from it are inaccurate. As part of remote health monitoring and wearable-system development, denoising ECG signals and correctly estimating IBIs from them has become an emerging topic among signal-processing researchers. Apart from conventional methods, deep-learning techniques have recently been used successfully in signal denoising, making the diagnosis process easier and leading to accuracy levels that were previously unachievable. We propose a deep-learning approach leveraging the Tiramisu autoencoder model to suppress motion-artifact noise and make the R-peaks of the ECG signal prominent even in the presence of high-intensity motion. After denoising, IBIs are estimated more accurately, expediting diagnosis tasks. Results show that our method enables IBI estimation from noisy ECG signals with an SNR as low as -30 dB, with an average root-mean-square error (RMSE) of 13 milliseconds for the estimated IBIs. At this noise level, our error percentage remains below 8%, outperforming other state-of-the-art techniques.
Submitted 1 July, 2021; originally announced July 2021.
Comments: 16 pages, 14 figures
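The denoise-then-estimate pipeline the abstract outlines can be sketched as follows. The paper's model is a Tiramisu (FC-DenseNet-style) autoencoder; the small 1-D convolutional autoencoder, the synthetic waveform, and the additive-noise stand-in for motion artifact below are all simplifying assumptions. After denoising, R-peaks would be detected on the cleaned signal and IBIs taken as the peak-to-peak intervals.

    # Illustrative only: train a tiny 1-D denoising autoencoder on a
    # synthetic ECG-like waveform corrupted by additive noise.
    import math
    import torch
    import torch.nn as nn

    def synthetic_batch(n=32, length=512, noise_std=1.0):
        t = torch.linspace(0, 8 * math.pi, length)
        clean = (torch.sin(t) + 0.5 * torch.sin(3 * t)).view(1, 1, -1).expand(n, 1, length)
        noisy = clean + noise_std * torch.randn(n, 1, length)   # motion-artifact stand-in
        return noisy, clean

    model = nn.Sequential(                     # stand-in for the Tiramisu autoencoder
        nn.Conv1d(1, 16, kernel_size=9, padding=4), nn.ReLU(),
        nn.Conv1d(16, 16, kernel_size=9, padding=4), nn.ReLU(),
        nn.Conv1d(16, 1, kernel_size=9, padding=4),
    )
    opt = torch.optim.Adam(model.parameters(), lr=1e-3)

    for step in range(200):
        noisy, clean = synthetic_batch()
        loss = nn.functional.mse_loss(model(noisy), clean)      # reconstruct clean signal
        opt.zero_grad()
        loss.backward()
        opt.step()
    print(f"final reconstruction MSE: {loss.item():.4f}")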
arXiv:2010.04495 (https://arxiv.org/abs/2010.04495) [pdf, other]
Subjects: cs.LG (Machine Learning); cs.AI (Artificial Intelligence); cs.CV (Computer Vision and Pattern Recognition)
Linear Mode Connectivity in Multitask and Continual Learning
Authors: Seyed Iman Mirzadeh, Mehrdad Farajtabar, Dilan Gorur, Razvan Pascanu, Hassan Ghasemzadeh
Abstract: Continual (sequential) training and multitask (simultaneous) training often attempt to solve the same overall objective: to find a solution that performs well on all considered tasks. The main difference is in the training regimes: continual learning can only access one task at a time, which for neural networks typically leads to catastrophic forgetting. That is, the solution found for a subsequent task no longer performs well on the previous ones. However, the relationship between the different minima that the two training regimes arrive at is not well understood. What sets them apart? Is there a local structure that could explain the difference in performance achieved by the two schemes? Motivated by recent work showing that different minima of the same task are typically connected by very simple curves of low error, we investigate whether multitask and continual solutions are similarly connected. We empirically find that such connectivity can indeed be reliably achieved and, more interestingly, that it can be achieved by a linear path, conditioned on having the same initialization for both. We thoroughly analyze this observation and discuss its significance for the continual learning process. Furthermore, we exploit this finding to propose an effective algorithm that constrains the sequentially learned minima to behave as the multitask solution. We show that our method outperforms several state-of-the-art continual learning algorithms on various vision benchmarks.
Submitted 9 October, 2020; originally announced October 2020.
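The linear-path check this abstract describes is straightforward to sketch: obtain two solutions from the same initialization, then evaluate the loss along the straight line w(alpha) = (1 - alpha) * w1 + alpha * w2; a flat, low-loss profile (no barrier) is the linear connectivity reported. The toy data and the use of different batch orders to produce the two solutions are illustrative assumptions, not the paper's multitask/continual setup.

    # Illustrative only: loss along the line segment between two solutions
    # trained from a shared initialization.
    import copy
    import torch
    import torch.nn as nn

    torch.manual_seed(0)
    X = torch.randn(1000, 20)
    y = (X[:, 0] > 0).long()
    loss_fn = nn.CrossEntropyLoss()

    def train(model, order):
        opt = torch.optim.SGD(model.parameters(), lr=0.1)
        for i in order:                            # one pass in the given order
            opt.zero_grad()
            loss_fn(model(X[i:i + 50]), y[i:i + 50]).backward()
            opt.step()
        return [p.detach().clone() for p in model.parameters()]

    init = nn.Sequential(nn.Linear(20, 64), nn.ReLU(), nn.Linear(64, 2))
    order = list(range(0, 1000, 50))
    w1 = train(copy.deepcopy(init), order)         # solution 1
    w2 = train(copy.deepcopy(init), order[::-1])   # solution 2, same init

    probe = copy.deepcopy(init)
    for k in range(11):
        alpha = k / 10
        with torch.no_grad():
            for p, a, b in zip(probe.parameters(), w1, w2):
                p.copy_((1 - alpha) * a + alpha * b)
            print(f"alpha={alpha:.1f}  loss={loss_fn(probe(X), y).item():.4f}")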
arXiv:2006.06958 (https://arxiv.org/abs/2006.06958) [pdf, other]
Subjects: cs.LG (Machine Learning); cs.NE (Neural and Evolutionary Computing); stat.ML (Machine Learning)
Understanding the Role of Training Regimes in Continual Learning
Authors: Seyed Iman Mirzadeh, Mehrdad Farajtabar, Razvan Pascanu, Hassan Ghasemzadeh
Abstract: Catastrophic forgetting affects the training of neural networks, limiting their ability to learn multiple tasks sequentially. From the perspective of the well-established plasticity-stability dilemma, neural networks tend to be overly plastic, lacking the stability necessary to prevent the forgetting of previous knowledge; that is, as learning progresses, networks tend to forget previously seen tasks. This phenomenon, coined in the continual learning literature, has attracted much attention lately, and several families of approaches have been proposed with different degrees of success. However, there has been limited prior work extensively analyzing the impact that different training regimes (learning rate, batch size, regularization method) can have on forgetting. In this work, we depart from the typical approach of altering the learning algorithm to improve stability. Instead, we hypothesize that the geometrical properties of the local minima found for each task play an important role in the overall degree of forgetting. In particular, we study the effect of dropout, learning rate decay, and batch size on forming training regimes that widen the tasks' local minima and, consequently, help the network avoid catastrophic forgetting. Our study provides practical insights for improving stability via simple yet effective techniques that outperform alternative baselines.
Submitted 12 June, 2020; originally announced June 2020.
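The stabilizing knobs named in this abstract (dropout, learning rate decay, batch size) amount to a small set of training-regime choices; the sketch below shows where each one enters a sequential training loop. All values and the synthetic tasks are illustrative assumptions, not the paper's recommended settings.

    # Illustrative only: the three training-regime knobs in one loop.
    import torch
    import torch.nn as nn

    model = nn.Sequential(
        nn.Linear(64, 256), nn.ReLU(),
        nn.Dropout(p=0.25),                         # knob 1: dropout
        nn.Linear(256, 2),
    )

    def train_task(model, X, y, lr, batch_size=16):     # knob 3: small batches
        opt = torch.optim.SGD(model.parameters(), lr=lr)
        loss_fn = nn.CrossEntropyLoss()
        for i in range(0, len(X), batch_size):
            opt.zero_grad()
            loss_fn(model(X[i:i + batch_size]), y[i:i + batch_size]).backward()
            opt.step()

    lr0, decay = 0.1, 0.5
    tasks = [(torch.randn(512, 64), torch.randint(0, 2, (512,))) for _ in range(3)]
    for k, (X, y) in enumerate(tasks):
        train_task(model, X, y, lr=lr0 * decay ** k)    # knob 2: decay lr per task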
arXiv:2003.07415 (https://arxiv.org/abs/2003.07415) [pdf, other]
Subjects: cs.LG (Machine Learning); stat.ML (Machine Learning)
ActiLabel: A Combinatorial Transfer Learning Framework for Activity Recognition
Authors: Parastoo Alinia, Iman Mirzadeh, Hassan Ghasemzadeh
Abstract: Sensor-based human activity recognition has become a critical component of many emerging applications, ranging from behavioral medicine to gaming. However, an unprecedented increase in the diversity of sensor devices in the Internet-of-Things era has limited the adoption of activity recognition models for use across different domains. We propose ActiLabel, a combinatorial framework that learns structural similarities between the events in an arbitrary domain and those of a different domain. The structural similarities are captured through a graph model, referred to as the dependency graph, which abstracts details of activity patterns in low-level signal and feature space. The activity labels are then autonomously learned by finding an optimal tiered mapping between the dependency graphs. Extensive experiments based on three public datasets demonstrate the superiority of ActiLabel over state-of-the-art transfer learning and deep learning methods.
Submitted 16 March, 2020; originally announced March 2020.
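As a very rough illustration of the mapping step, the toy below summarizes each domain's event clusters as a transition graph and matches clusters across domains by minimizing a structural cost with a one-to-one assignment. The Hungarian-assignment stand-in and the toy transition counts are assumptions for illustration only; ActiLabel's tiered mapping between dependency graphs is considerably more involved.

    # Illustrative only: match event clusters across two domains by
    # comparing rows of their normalized transition matrices.
    import numpy as np
    from scipy.optimize import linear_sum_assignment

    def node_features(transitions):
        # Each node's feature vector: its row of transition frequencies.
        return transitions / transitions.sum(axis=1, keepdims=True)

    # Toy "dependency graphs": transition counts between 3 event clusters.
    src = np.array([[5., 2., 1.], [1., 6., 2.], [2., 1., 7.]])   # labeled source
    tgt = np.array([[6., 1., 2.], [2., 5., 1.], [1., 2., 8.]])   # unlabeled target

    cost = np.linalg.norm(node_features(src)[:, None] - node_features(tgt)[None, :], axis=2)
    rows, cols = linear_sum_assignment(cost)    # optimal one-to-one mapping
    for r, c in zip(rows, cols):
        print(f"source cluster {r} -> target cluster {c}")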