CINXE.COM

Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/> <!-- new favicon config and versions by realfavicongenerator.net --> <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b"> <!-- end favicon config --> <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\\(","\\)"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script 
src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a> <!-- contains Cornell logo and sponsor statement --> <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. 
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" 
role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 68 results for author: <span class="mathjax">Smith, V</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Smith%2C+V">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Smith, V"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Smith%2C+V&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option 
value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Smith, V"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Smith%2C+V&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Smith%2C+V&amp;start=0" class="pagination-link is-current" aria-label="Page 1" aria-current="page">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Smith%2C+V&amp;start=50" class="pagination-link " aria-label="Goto page 2">2 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.03730">arXiv:2411.03730</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.03730">pdf</a>, <a href="https://arxiv.org/format/2411.03730">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> 
NeurIPS 2023 Competition: Privacy Preserving Federated Learning Document VQA </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Tobaben%2C+M">Marlon Tobaben</a>, <a href="/search/cs?searchtype=author&amp;query=Souibgui%2C+M+A">Mohamed Ali Souibgui</a>, <a href="/search/cs?searchtype=author&amp;query=Tito%2C+R">Rubèn Tito</a>, <a href="/search/cs?searchtype=author&amp;query=Nguyen%2C+K">Khanh Nguyen</a>, <a href="/search/cs?searchtype=author&amp;query=Kerkouche%2C+R">Raouf Kerkouche</a>, <a href="/search/cs?searchtype=author&amp;query=Jung%2C+K">Kangsoo Jung</a>, <a href="/search/cs?searchtype=author&amp;query=J%C3%A4lk%C3%B6%2C+J">Joonas Jälkö</a>, <a href="/search/cs?searchtype=author&amp;query=Kang%2C+L">Lei Kang</a>, <a href="/search/cs?searchtype=author&amp;query=Barsky%2C+A">Andrey Barsky</a>, <a href="/search/cs?searchtype=author&amp;query=d%27Andecy%2C+V+P">Vincent Poulain d&#39;Andecy</a>, <a href="/search/cs?searchtype=author&amp;query=Joseph%2C+A">Aurélie Joseph</a>, <a href="/search/cs?searchtype=author&amp;query=Muhamed%2C+A">Aashiq Muhamed</a>, <a href="/search/cs?searchtype=author&amp;query=Kuo%2C+K">Kevin Kuo</a>, <a href="/search/cs?searchtype=author&amp;query=Smith%2C+V">Virginia Smith</a>, <a href="/search/cs?searchtype=author&amp;query=Yamasaki%2C+Y">Yusuke Yamasaki</a>, <a href="/search/cs?searchtype=author&amp;query=Fukami%2C+T">Takumi Fukami</a>, <a href="/search/cs?searchtype=author&amp;query=Niwa%2C+K">Kenta Niwa</a>, <a href="/search/cs?searchtype=author&amp;query=Tyou%2C+I">Iifan Tyou</a>, <a href="/search/cs?searchtype=author&amp;query=Ishii%2C+H">Hiro Ishii</a>, <a href="/search/cs?searchtype=author&amp;query=Yokota%2C+R">Rio Yokota</a>, <a href="/search/cs?searchtype=author&amp;query=N%2C+R">Ragul N</a>, <a href="/search/cs?searchtype=author&amp;query=Kutum%2C+R">Rintu Kutum</a>, <a href="/search/cs?searchtype=author&amp;query=Llados%2C+J">Josep Llados</a>, <a 
href="/search/cs?searchtype=author&amp;query=Valveny%2C+E">Ernest Valveny</a>, <a href="/search/cs?searchtype=author&amp;query=Honkela%2C+A">Antti Honkela</a> , et al. (2 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.03730v1-abstract-short" style="display: inline;"> The Privacy Preserving Federated Learning Document VQA (PFL-DocVQA) competition challenged the community to develop provably private and communication-efficient solutions in a federated setting for a real-life use case: invoice processing. The competition introduced a dataset of real invoice documents, along with associated questions and answers requiring information extraction and reasoning over&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03730v1-abstract-full').style.display = 'inline'; document.getElementById('2411.03730v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.03730v1-abstract-full" style="display: none;"> The Privacy Preserving Federated Learning Document VQA (PFL-DocVQA) competition challenged the community to develop provably private and communication-efficient solutions in a federated setting for a real-life use case: invoice processing. The competition introduced a dataset of real invoice documents, along with associated questions and answers requiring information extraction and reasoning over the document images. Thereby, it brings together researchers and expertise from the document analysis, privacy, and federated learning communities. Participants fine-tuned a pre-trained, state-of-the-art Document Visual Question Answering model provided by the organizers for this new domain, mimicking a typical federated invoice processing setup. 
The base model is a multi-modal generative language model, and sensitive information could be exposed through either the visual or textual input modality. Participants proposed elegant solutions to reduce communication costs while maintaining a minimum utility threshold in track 1 and to protect all information from each document provider using differential privacy in track 2. The competition served as a new testbed for developing and testing private federated learning methods, simultaneously raising awareness about privacy within the document image analysis and recognition community. Ultimately, the competition analysis provides best practices and recommendations for successfully running privacy-focused federated learning challenges in the future. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03730v1-abstract-full').style.display = 'none'; document.getElementById('2411.03730v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">27 pages, 6 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00743">arXiv:2411.00743</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.00743">pdf</a>, <a href="https://arxiv.org/format/2411.00743">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Decoding Dark Matter: Specialized Sparse Autoencoders for Interpreting Rare Concepts in Foundation Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Muhamed%2C+A">Aashiq Muhamed</a>, <a href="/search/cs?searchtype=author&amp;query=Diab%2C+M">Mona Diab</a>, <a href="/search/cs?searchtype=author&amp;query=Smith%2C+V">Virginia Smith</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00743v1-abstract-short" style="display: inline;"> Understanding and mitigating the potential risks associated with foundation models (FMs) hinges on developing effective interpretability methods. Sparse Autoencoders (SAEs) have emerged as a promising tool for disentangling FM representations, but they struggle to capture rare, yet crucial concepts in the data. 
We introduce Specialized Sparse Autoencoders (SSAEs), designed to illuminate these elus&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00743v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00743v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00743v1-abstract-full" style="display: none;"> Understanding and mitigating the potential risks associated with foundation models (FMs) hinges on developing effective interpretability methods. Sparse Autoencoders (SAEs) have emerged as a promising tool for disentangling FM representations, but they struggle to capture rare, yet crucial concepts in the data. We introduce Specialized Sparse Autoencoders (SSAEs), designed to illuminate these elusive dark matter features by focusing on specific subdomains. We present a practical recipe for training SSAEs, demonstrating the efficacy of dense retrieval for data selection and the benefits of Tilted Empirical Risk Minimization as a training objective to improve concept recall. Our evaluation of SSAEs on standard metrics, such as downstream perplexity and $L_0$ sparsity, show that they effectively capture subdomain tail concepts, exceeding the capabilities of general-purpose SAEs. We showcase the practical utility of SSAEs in a case study on the Bias in Bios dataset, where SSAEs achieve a 12.5\% increase in worst-group classification accuracy when applied to remove spurious gender information. SSAEs provide a powerful new lens for peering into the inner workings of FMs in subdomains. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00743v1-abstract-full').style.display = 'none'; document.getElementById('2411.00743v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.02879">arXiv:2410.02879</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.02879">pdf</a>, <a href="https://arxiv.org/format/2410.02879">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Position: LLM Unlearning Benchmarks are Weak Measures of Progress </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Thaker%2C+P">Pratiksha Thaker</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+S">Shengyuan Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Kale%2C+N">Neil Kale</a>, <a href="/search/cs?searchtype=author&amp;query=Maurya%2C+Y">Yash Maurya</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Z+S">Zhiwei Steven Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Smith%2C+V">Virginia Smith</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.02879v1-abstract-short" style="display: inline;"> Unlearning methods have the potential to improve the privacy and safety of large language models (LLMs) by removing sensitive or harmful information post hoc. 
The LLM unlearning research community has increasingly turned toward empirical benchmarks to assess the effectiveness of such methods. In this paper, we find that existing benchmarks provide an overly optimistic and potentially misleading vi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.02879v1-abstract-full').style.display = 'inline'; document.getElementById('2410.02879v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.02879v1-abstract-full" style="display: none;"> Unlearning methods have the potential to improve the privacy and safety of large language models (LLMs) by removing sensitive or harmful information post hoc. The LLM unlearning research community has increasingly turned toward empirical benchmarks to assess the effectiveness of such methods. In this paper, we find that existing benchmarks provide an overly optimistic and potentially misleading view on the effectiveness of candidate unlearning methods. By introducing simple, benign modifications to a number of popular benchmarks, we expose instances where supposedly unlearned information remains accessible, or where the unlearning process has degraded the model&#39;s performance on retained information to a much greater extent than indicated by the original benchmark. We identify that existing benchmarks are particularly vulnerable to modifications that introduce even loose dependencies between the forget and retain information. Further, we show that ambiguity in unlearning targets in existing benchmarks can easily lead to the design of methods that overfit to the given test queries. Based on our findings, we urge the community to be cautious when interpreting benchmark results as reliable measures of progress, and we provide several recommendations to guide future LLM unlearning research. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.02879v1-abstract-full').style.display = 'none'; document.getElementById('2410.02879v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.02348">arXiv:2407.02348</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.02348">pdf</a>, <a href="https://arxiv.org/format/2407.02348">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Revisiting Cascaded Ensembles for Efficient Inference </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Kolawole%2C+S">Steven Kolawole</a>, <a href="/search/cs?searchtype=author&amp;query=Dennis%2C+D">Don Dennis</a>, <a href="/search/cs?searchtype=author&amp;query=Talwalkar%2C+A">Ameet Talwalkar</a>, <a href="/search/cs?searchtype=author&amp;query=Smith%2C+V">Virginia Smith</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.02348v1-abstract-short" style="display: inline;"> A common approach to make machine learning inference more efficient is to use example-specific adaptive schemes, which route or select models for each example at inference time. In this work we study a simple scheme for adaptive inference. 
We build a cascade of ensembles (CoE), beginning with resource-efficient models and growing to larger, more expressive models, where ensemble agreement serves a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.02348v1-abstract-full').style.display = 'inline'; document.getElementById('2407.02348v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.02348v1-abstract-full" style="display: none;"> A common approach to make machine learning inference more efficient is to use example-specific adaptive schemes, which route or select models for each example at inference time. In this work we study a simple scheme for adaptive inference. We build a cascade of ensembles (CoE), beginning with resource-efficient models and growing to larger, more expressive models, where ensemble agreement serves as a data-dependent routing criterion. This scheme is easy to incorporate into existing inference pipelines, requires no additional training, and can be used to place models across multiple resource tiers--for instance, serving efficient models at the edge and invoking larger models in the cloud only when necessary. In cases where parallel inference is feasible, we show that CoE can improve accuracy relative to the single best model while reducing the average cost of inference by up to 7x, and provides Pareto-dominate solutions in accuracy and efficiency relative to existing adaptive inference baselines. These savings translate to an over 3x-reduction in total monetary cost when performing inference using a heterogeneous cluster of GPUs. Finally, for edge inference scenarios where portions of the cascade reside at the edge vs. in the cloud, CoE can provide a 14x reduction in communication cost and inference latency without sacrificing accuracy. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.02348v1-abstract-full').style.display = 'none'; document.getElementById('2407.02348v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ES-FOMO, ICML 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.17660">arXiv:2406.17660</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.17660">pdf</a>, <a href="https://arxiv.org/format/2406.17660">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Grass: Compute Efficient Low-Memory LLM Training with Structured Sparse Gradients </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Muhamed%2C+A">Aashiq Muhamed</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+O">Oscar Li</a>, <a href="/search/cs?searchtype=author&amp;query=Woodruff%2C+D">David Woodruff</a>, <a href="/search/cs?searchtype=author&amp;query=Diab%2C+M">Mona Diab</a>, <a href="/search/cs?searchtype=author&amp;query=Smith%2C+V">Virginia Smith</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.17660v1-abstract-short" style="display: inline;"> Large language model (LLM) training and finetuning are often bottlenecked by limited GPU memory. 
While existing projection-based optimization methods address this by projecting gradients into a lower-dimensional subspace to reduce optimizer state memory, they typically rely on dense projection matrices, which can introduce computational and memory overheads. In this work, we propose Grass (GRAdien&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.17660v1-abstract-full').style.display = 'inline'; document.getElementById('2406.17660v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.17660v1-abstract-full" style="display: none;"> Large language model (LLM) training and finetuning are often bottlenecked by limited GPU memory. While existing projection-based optimization methods address this by projecting gradients into a lower-dimensional subspace to reduce optimizer state memory, they typically rely on dense projection matrices, which can introduce computational and memory overheads. In this work, we propose Grass (GRAdient Structured Sparsification), a novel approach that leverages sparse projections to transform gradients into structured sparse updates. This design not only significantly reduces memory usage for optimizer states but also minimizes gradient memory footprint, computation, and communication costs, leading to substantial throughput improvements. Extensive experiments on pretraining and finetuning tasks demonstrate that Grass achieves competitive performance to full-rank training and existing projection-based methods. Notably, Grass enables half-precision pretraining of a 13B parameter LLaMA model on a single 40GB A100 GPU--a feat infeasible for previous methods--and yields up to a $2\times$ throughput improvement on an 8-GPU system. Code can be found at https://github.com/aashiqmuhamed/GRASS . 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.17660v1-abstract-full').style.display = 'none'; document.getElementById('2406.17660v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.14532">arXiv:2406.14532</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.14532">pdf</a>, <a href="https://arxiv.org/format/2406.14532">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> RL on Incorrect Synthetic Data Scales the Efficiency of LLM Math Reasoning by Eight-Fold </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Setlur%2C+A">Amrith Setlur</a>, <a href="/search/cs?searchtype=author&amp;query=Garg%2C+S">Saurabh Garg</a>, <a href="/search/cs?searchtype=author&amp;query=Geng%2C+X">Xinyang Geng</a>, <a href="/search/cs?searchtype=author&amp;query=Garg%2C+N">Naman Garg</a>, <a href="/search/cs?searchtype=author&amp;query=Smith%2C+V">Virginia Smith</a>, <a href="/search/cs?searchtype=author&amp;query=Kumar%2C+A">Aviral Kumar</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.14532v1-abstract-short" style="display: inline;"> Training on model-generated synthetic data is a promising approach for finetuning LLMs, but it 
remains unclear when it helps or hurts. In this paper, we investigate this question for math reasoning via an empirical study, followed by building a conceptual understanding of our observations. First, we find that while the typical approach of finetuning a model on synthetic correct or positive problem&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.14532v1-abstract-full').style.display = 'inline'; document.getElementById('2406.14532v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.14532v1-abstract-full" style="display: none;"> Training on model-generated synthetic data is a promising approach for finetuning LLMs, but it remains unclear when it helps or hurts. In this paper, we investigate this question for math reasoning via an empirical study, followed by building a conceptual understanding of our observations. First, we find that while the typical approach of finetuning a model on synthetic correct or positive problem-solution pairs generated by capable models offers modest performance gains, sampling more correct solutions from the finetuned learner itself followed by subsequent fine-tuning on this self-generated data $\textbf{doubles}$ the efficiency of the same synthetic problems. At the same time, training on model-generated positives can amplify various spurious correlations, resulting in flat or even inverse scaling trends as the amount of data increases. Surprisingly, we find that several of these issues can be addressed if we also utilize negative responses, i.e., model-generated responses that are deemed incorrect by a final answer verifier. Crucially, these negatives must be constructed such that the training can appropriately recover the utility or advantage of each intermediate step in the negative response. 
With this per-step scheme, we are able to attain consistent gains over only positive data, attaining performance similar to amplifying the amount of synthetic data by $\mathbf{8 \times}$. We show that training on per-step negatives can help to unlearn spurious correlations in the positive data, and is equivalent to advantage-weighted reinforcement learning (RL), implying that it inherits robustness benefits of RL over imitating positive data alone. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.14532v1-abstract-full').style.display = 'none'; document.getElementById('2406.14532v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.13356">arXiv:2406.13356</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.13356">pdf</a>, <a href="https://arxiv.org/format/2406.13356">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Jogging the Memory of Unlearned LLMs Through Targeted Relearning Attacks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hu%2C+S">Shengyuan Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Fu%2C+Y">Yiwei Fu</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Z+S">Zhiwei Steven Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Smith%2C+V">Virginia Smith</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short 
has-text-grey-dark mathjax" id="2406.13356v3-abstract-short" style="display: inline;"> Machine unlearning is a promising approach to mitigate undesirable memorization of training data in LLMs. However, in this work we show that existing approaches for unlearning in LLMs are surprisingly susceptible to a simple set of targeted relearning attacks. With access to only a small and potentially loosely related set of data, we find that we can &#34;jog&#34; the memory of unlearned models to revers&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.13356v3-abstract-full').style.display = 'inline'; document.getElementById('2406.13356v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.13356v3-abstract-full" style="display: none;"> Machine unlearning is a promising approach to mitigate undesirable memorization of training data in LLMs. However, in this work we show that existing approaches for unlearning in LLMs are surprisingly susceptible to a simple set of targeted relearning attacks. With access to only a small and potentially loosely related set of data, we find that we can &#34;jog&#34; the memory of unlearned models to reverse the effects of unlearning. For example, we show that relearning on public medical articles can lead an unlearned LLM to output harmful knowledge about bioweapons, and relearning general wiki information about the book series Harry Potter can force the model to output verbatim memorized text. We formalize this unlearning-relearning pipeline, explore the attack across three popular unlearning benchmarks, and discuss future directions and guidelines that result from our study. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.13356v3-abstract-full').style.display = 'none'; document.getElementById('2406.13356v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">26 pages, 5 figures, 7 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.05233">arXiv:2406.05233</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.05233">pdf</a>, <a href="https://arxiv.org/format/2406.05233">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> Federated LoRA with Sparse Communication </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Kuo%2C+K">Kevin Kuo</a>, <a href="/search/cs?searchtype=author&amp;query=Raje%2C+A">Arian Raje</a>, <a href="/search/cs?searchtype=author&amp;query=Rajesh%2C+K">Kousik Rajesh</a>, <a href="/search/cs?searchtype=author&amp;query=Smith%2C+V">Virginia Smith</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.05233v1-abstract-short" style="display: inline;"> 
Low-rank adaptation (LoRA) is a natural method for finetuning in communication-constrained machine learning settings such as cross-device federated learning. Prior work that has studied LoRA in the context of federated learning has focused on improving LoRA&#39;s robustness to heterogeneity and privacy. In this work, we instead consider techniques for further improving communication-efficiency in fede&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.05233v1-abstract-full').style.display = 'inline'; document.getElementById('2406.05233v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.05233v1-abstract-full" style="display: none;"> Low-rank adaptation (LoRA) is a natural method for finetuning in communication-constrained machine learning settings such as cross-device federated learning. Prior work that has studied LoRA in the context of federated learning has focused on improving LoRA&#39;s robustness to heterogeneity and privacy. In this work, we instead consider techniques for further improving communication-efficiency in federated LoRA. Unfortunately, we show that centralized ML methods that improve the efficiency of LoRA through unstructured pruning do not transfer well to federated settings. We instead study a simple approach, \textbf{FLASC}, that applies sparsity to LoRA during communication while allowing clients to locally fine-tune the entire LoRA module. Across four common federated learning tasks, we demonstrate that this method matches the performance of dense LoRA with up to $10\times$ less communication. Additionally, despite being designed primarily to target communication, we find that this approach has benefits in terms of heterogeneity and privacy relative to existing approaches tailored to these specific concerns. 
Overall, our work highlights the importance of considering system-specific constraints when developing communication-efficient finetuning approaches, and serves as a simple and competitive baseline for future work in federated finetuning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.05233v1-abstract-full').style.display = 'none'; document.getElementById('2406.05233v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages (excluding references), 8 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.05598">arXiv:2403.05598</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.05598">pdf</a>, <a href="https://arxiv.org/format/2403.05598">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Privacy Amplification for the Gaussian Mechanism via Bounded Support </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hu%2C+S">Shengyuan Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Mahloujifar%2C+S">Saeed Mahloujifar</a>, <a href="/search/cs?searchtype=author&amp;query=Smith%2C+V">Virginia Smith</a>, <a href="/search/cs?searchtype=author&amp;query=Chaudhuri%2C+K">Kamalika Chaudhuri</a>, <a 
href="/search/cs?searchtype=author&amp;query=Guo%2C+C">Chuan Guo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.05598v1-abstract-short" style="display: inline;"> Data-dependent privacy accounting frameworks such as per-instance differential privacy (pDP) and Fisher information loss (FIL) confer fine-grained privacy guarantees for individuals in a fixed training dataset. These guarantees can be desirable compared to vanilla DP in real world settings as they tightly upper-bound the privacy leakage for a $\textit{specific}$ individual in an $\textit{actual}$&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.05598v1-abstract-full').style.display = 'inline'; document.getElementById('2403.05598v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.05598v1-abstract-full" style="display: none;"> Data-dependent privacy accounting frameworks such as per-instance differential privacy (pDP) and Fisher information loss (FIL) confer fine-grained privacy guarantees for individuals in a fixed training dataset. These guarantees can be desirable compared to vanilla DP in real world settings as they tightly upper-bound the privacy leakage for a $\textit{specific}$ individual in an $\textit{actual}$ dataset, rather than considering worst-case datasets. While these frameworks are beginning to gain popularity, to date, there is a lack of private mechanisms that can fully leverage advantages of data-dependent accounting. To bridge this gap, we propose simple modifications of the Gaussian mechanism with bounded support, showing that they amplify privacy guarantees under data-dependent accounting. 
Experiments on model training with DP-SGD show that using bounded support Gaussian mechanisms can provide a reduction of the pDP bound $ε$ by as much as 30% without negative effects on model utility. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.05598v1-abstract-full').style.display = 'none'; document.getElementById('2403.05598v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">23 pages, 4 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.04099">arXiv:2403.04099</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.04099">pdf</a>, <a href="https://arxiv.org/format/2403.04099">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Many-Objective Multi-Solution Transport </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Ziyue Li</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+T">Tian Li</a>, <a href="/search/cs?searchtype=author&amp;query=Smith%2C+V">Virginia Smith</a>, <a href="/search/cs?searchtype=author&amp;query=Bilmes%2C+J">Jeff Bilmes</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+T">Tianyi Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" 
id="2403.04099v1-abstract-short" style="display: inline;"> Optimizing the performance of many objectives (instantiated by tasks or clients) jointly with a few Pareto stationary solutions (models) is critical in machine learning. However, previous multi-objective optimization methods often focus on a few number of objectives and cannot scale to many objectives that outnumber the solutions, leading to either subpar performance or ignored objectives. We intr&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.04099v1-abstract-full').style.display = 'inline'; document.getElementById('2403.04099v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.04099v1-abstract-full" style="display: none;"> Optimizing the performance of many objectives (instantiated by tasks or clients) jointly with a few Pareto stationary solutions (models) is critical in machine learning. However, previous multi-objective optimization methods often focus on a few number of objectives and cannot scale to many objectives that outnumber the solutions, leading to either subpar performance or ignored objectives. We introduce Many-objective multi-solution Transport (MosT), a framework that finds multiple diverse solutions in the Pareto front of many objectives. Our insight is to seek multiple solutions, each performing as a domain expert and focusing on a specific subset of objectives while collectively covering all of them. MosT formulates the problem as a bi-level optimization of weighted objectives for each solution, where the weights are defined by an optimal transport between the objectives and solutions. Our algorithm ensures convergence to Pareto stationary solutions for complementary subsets of objectives. 
On a range of applications in federated learning, multi-task learning, and mixture-of-prompt learning for LLMs, MosT distinctly outperforms strong baselines, delivering high-quality, diverse solutions that profile the entire Pareto frontier, thus ensuring balanced trade-offs across many objectives. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.04099v1-abstract-full').style.display = 'none'; document.getElementById('2403.04099v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.03329">arXiv:2403.03329</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.03329">pdf</a>, <a href="https://arxiv.org/format/2403.03329">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Guardrail Baselines for Unlearning in LLMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Thaker%2C+P">Pratiksha Thaker</a>, <a href="/search/cs?searchtype=author&amp;query=Maurya%2C+Y">Yash Maurya</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+S">Shengyuan Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Z+S">Zhiwei Steven Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Smith%2C+V">Virginia Smith</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.03329v3-abstract-short" style="display: inline;"> Recent work 
has demonstrated that finetuning is a promising approach to &#39;unlearn&#39; concepts from large language models. However, finetuning can be expensive, as it requires both generating a set of examples and running iterations of finetuning to update the model. In this work, we show that simple guardrail-based approaches such as prompting and filtering can achieve unlearning results comparable t&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.03329v3-abstract-full').style.display = 'inline'; document.getElementById('2403.03329v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.03329v3-abstract-full" style="display: none;"> Recent work has demonstrated that finetuning is a promising approach to &#39;unlearn&#39; concepts from large language models. However, finetuning can be expensive, as it requires both generating a set of examples and running iterations of finetuning to update the model. In this work, we show that simple guardrail-based approaches such as prompting and filtering can achieve unlearning results comparable to finetuning. We recommend that researchers investigate these lightweight baselines when evaluating the performance of more computationally intensive finetuning methods. While we do not claim that methods such as prompting or filtering are universal solutions to the problem of unlearning, our work suggests the need for evaluation metrics that can better separate the power of guardrails vs. finetuning, and highlights scenarios where guardrails expose possible unintended behavior in existing metrics and benchmarks. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.03329v3-abstract-full').style.display = 'none'; document.getElementById('2403.03329v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 5 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Preliminary work, accepted to ICLR workshop SeT-LLM 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.16187">arXiv:2402.16187</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2402.16187">pdf</a>, <a href="https://arxiv.org/format/2402.16187">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> No Free Lunch in LLM Watermarking: Trade-offs in Watermarking Design Choices </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Pang%2C+Q">Qi Pang</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+S">Shengyuan Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+W">Wenting Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Smith%2C+V">Virginia Smith</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis 
has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.16187v3-abstract-short" style="display: inline;"> Advances in generative models have made it possible for AI-generated text, code, and images to mirror human-generated content in many applications. Watermarking, a technique that aims to embed information in the output of a model to verify its source, is useful for mitigating the misuse of such AI-generated content. However, we show that common design choices in LLM watermarking schemes make the r&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.16187v3-abstract-full').style.display = 'inline'; document.getElementById('2402.16187v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.16187v3-abstract-full" style="display: none;"> Advances in generative models have made it possible for AI-generated text, code, and images to mirror human-generated content in many applications. Watermarking, a technique that aims to embed information in the output of a model to verify its source, is useful for mitigating the misuse of such AI-generated content. However, we show that common design choices in LLM watermarking schemes make the resulting systems surprisingly susceptible to attack -- leading to fundamental trade-offs in robustness, utility, and usability. To navigate these trade-offs, we rigorously study a set of simple yet effective attacks on common watermarking systems, and propose guidelines and defenses for LLM watermarking in practice. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.16187v3-abstract-full').style.display = 'none'; document.getElementById('2402.16187v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 25 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.05406">arXiv:2402.05406</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2402.05406">pdf</a>, <a href="https://arxiv.org/format/2402.05406">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Everybody Prune Now: Structured Pruning of LLMs with only Forward Passes </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Dery%2C+L">Lucio Dery</a>, <a href="/search/cs?searchtype=author&amp;query=Kolawole%2C+S">Steven Kolawole</a>, <a href="/search/cs?searchtype=author&amp;query=Kagy%2C+J">Jean-François Kagy</a>, <a href="/search/cs?searchtype=author&amp;query=Smith%2C+V">Virginia Smith</a>, <a href="/search/cs?searchtype=author&amp;query=Neubig%2C+G">Graham Neubig</a>, <a href="/search/cs?searchtype=author&amp;query=Talwalkar%2C+A">Ameet Talwalkar</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.05406v2-abstract-short" 
style="display: inline;"> Given the generational gap in available hardware between lay practitioners and the most endowed institutions, LLMs are becoming increasingly inaccessible as they grow in size. Whilst many approaches have been proposed to compress LLMs to make their resource consumption manageable, these methods themselves tend to be resource intensive, putting them out of the reach of the very user groups they tar&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.05406v2-abstract-full').style.display = 'inline'; document.getElementById('2402.05406v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.05406v2-abstract-full" style="display: none;"> Given the generational gap in available hardware between lay practitioners and the most endowed institutions, LLMs are becoming increasingly inaccessible as they grow in size. Whilst many approaches have been proposed to compress LLMs to make their resource consumption manageable, these methods themselves tend to be resource intensive, putting them out of the reach of the very user groups they target. In this work, we explore the problem of structured pruning of LLMs using only forward passes. We seek to empower practitioners to prune models so large that their available hardware has just enough memory to run inference. We develop Bonsai, a gradient-free, perturbative pruning method capable of delivering small, fast, and accurate pruned models. We observe that Bonsai outputs pruned models that (i) outperform those generated by more expensive gradient-based structured pruning methods, and (ii) are twice as fast (with comparable accuracy) as those generated by semi-structured pruning methods requiring comparable resources as Bonsai. 
We also leverage Bonsai to produce a new sub-2B model using a single A6000 that yields state-of-the-art performance on 4/6 tasks on the Huggingface Open LLM leaderboard. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.05406v2-abstract-full').style.display = 'none'; document.getElementById('2402.05406v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">15 pages, 4 figures, 15 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.15551">arXiv:2312.15551</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2312.15551">pdf</a>, <a href="https://arxiv.org/format/2312.15551">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> On the Benefits of Public Representations for Private Transfer Learning under Distribution Shift </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Thaker%2C+P">Pratiksha Thaker</a>, <a href="/search/cs?searchtype=author&amp;query=Setlur%2C+A">Amrith Setlur</a>, <a 
href="/search/cs?searchtype=author&amp;query=Wu%2C+Z+S">Zhiwei Steven Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Smith%2C+V">Virginia Smith</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.15551v4-abstract-short" style="display: inline;"> Public pretraining is a promising approach to improve differentially private model training. However, recent work has noted that many positive research results studying this paradigm only consider in-distribution tasks, and may not apply to settings where there is distribution shift between the pretraining and finetuning data -- a scenario that is likely when finetuning private tasks due to the se&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.15551v4-abstract-full').style.display = 'inline'; document.getElementById('2312.15551v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.15551v4-abstract-full" style="display: none;"> Public pretraining is a promising approach to improve differentially private model training. However, recent work has noted that many positive research results studying this paradigm only consider in-distribution tasks, and may not apply to settings where there is distribution shift between the pretraining and finetuning data -- a scenario that is likely when finetuning private tasks due to the sensitive nature of the data. In this work, we show empirically across three tasks that even in settings with large distribution shift, where both zero-shot performance from public data and training from scratch with private data give unusably weak results, public features can in fact improve private training accuracy by up to 67\% over private training from scratch. 
We provide a theoretical explanation for this phenomenon, showing that if the public and private data share a low-dimensional representation, public representations can improve the sample complexity of private training even if it is impossible to learn the private task from the public data alone. Altogether, our results provide evidence that public data can indeed make private training practical in realistic settings of extreme distribution shift. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.15551v4-abstract-full').style.display = 'none'; document.getElementById('2312.15551v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.03318">arXiv:2312.03318</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2312.03318">pdf</a>, <a href="https://arxiv.org/format/2312.03318">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Complementary Benefits of Contrastive Learning and Self-Training Under Distribution Shift </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Garg%2C+S">Saurabh Garg</a>, <a href="/search/cs?searchtype=author&amp;query=Setlur%2C+A">Amrith Setlur</a>, <a href="/search/cs?searchtype=author&amp;query=Lipton%2C+Z+C">Zachary Chase Lipton</a>, <a href="/search/cs?searchtype=author&amp;query=Balakrishnan%2C+S">Sivaraman Balakrishnan</a>, <a href="/search/cs?searchtype=author&amp;query=Smith%2C+V">Virginia Smith</a>, <a href="/search/cs?searchtype=author&amp;query=Raghunathan%2C+A">Aditi Raghunathan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.03318v1-abstract-short" style="display: inline;"> Self-training and contrastive learning have emerged as leading techniques for incorporating unlabeled data, both under distribution shift (unsupervised domain adaptation) and when it is absent (semi-supervised learning). However, despite the popularity and compatibility of these techniques, their efficacy in combination remains unexplored. 
In this paper, we undertake a systematic empirical investi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.03318v1-abstract-full').style.display = 'inline'; document.getElementById('2312.03318v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.03318v1-abstract-full" style="display: none;"> Self-training and contrastive learning have emerged as leading techniques for incorporating unlabeled data, both under distribution shift (unsupervised domain adaptation) and when it is absent (semi-supervised learning). However, despite the popularity and compatibility of these techniques, their efficacy in combination remains unexplored. In this paper, we undertake a systematic empirical investigation of this combination, finding that (i) in domain adaptation settings, self-training and contrastive learning offer significant complementary gains; and (ii) in semi-supervised learning settings, surprisingly, the benefits are not synergistic. Across eight distribution shift datasets (e.g., BREEDs, WILDS), we demonstrate that the combined method obtains 3--8% higher accuracy than either approach independently. We then theoretically analyze these techniques in a simplified model of distribution shift, demonstrating scenarios under which the features produced by contrastive learning can yield a good initialization for self-training to further amplify gains and achieve optimal performance, even when either method alone would fail. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.03318v1-abstract-full').style.display = 'none'; document.getElementById('2312.03318v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.01424">arXiv:2310.01424</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2310.01424">pdf</a>, <a href="https://arxiv.org/ps/2310.01424">ps</a>, <a href="https://arxiv.org/format/2310.01424">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Identifying and Mitigating Privacy Risks Stemming from Language Models: A Survey </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Smith%2C+V">Victoria Smith</a>, <a href="/search/cs?searchtype=author&amp;query=Shamsabadi%2C+A+S">Ali Shahin Shamsabadi</a>, <a href="/search/cs?searchtype=author&amp;query=Ashurst%2C+C">Carolyn Ashurst</a>, <a href="/search/cs?searchtype=author&amp;query=Weller%2C+A">Adrian Weller</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.01424v2-abstract-short" style="display: 
inline;"> Large Language Models (LLMs) have shown greatly enhanced performance in recent years, attributed to increased size and extensive training data. This advancement has led to widespread interest and adoption across industries and the public. However, training data memorization in Machine Learning models scales with model size, particularly concerning for LLMs. Memorized text sequences have the potent&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.01424v2-abstract-full').style.display = 'inline'; document.getElementById('2310.01424v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.01424v2-abstract-full" style="display: none;"> Large Language Models (LLMs) have shown greatly enhanced performance in recent years, attributed to increased size and extensive training data. This advancement has led to widespread interest and adoption across industries and the public. However, training data memorization in Machine Learning models scales with model size, particularly concerning for LLMs. Memorized text sequences have the potential to be directly leaked from LLMs, posing a serious threat to data privacy. Various techniques have been developed to attack LLMs and extract their training data. As these models continue to grow, this issue becomes increasingly critical. To help researchers and policymakers understand the state of knowledge around privacy attacks and mitigations, including where more work is needed, we present the first SoK on data privacy for LLMs. We (i) identify a taxonomy of salient dimensions where attacks differ on LLMs, (ii) systematize existing attacks, using our taxonomy of dimensions to highlight key trends, (iii) survey existing mitigation strategies, highlighting their strengths and limitations, and (iv) identify key gaps, demonstrating open problems and areas for concern. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.01424v2-abstract-full').style.display = 'none'; document.getElementById('2310.01424v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">15 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2304.12180">arXiv:2304.12180</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2304.12180">pdf</a>, <a href="https://arxiv.org/format/2304.12180">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Variance-Reduced Gradient Estimation via Noise-Reuse in Online Evolution Strategies </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+O">Oscar Li</a>, <a href="/search/cs?searchtype=author&amp;query=Harrison%2C+J">James Harrison</a>, <a href="/search/cs?searchtype=author&amp;query=Sohl-Dickstein%2C+J">Jascha Sohl-Dickstein</a>, <a href="/search/cs?searchtype=author&amp;query=Smith%2C+V">Virginia Smith</a>, <a href="/search/cs?searchtype=author&amp;query=Metz%2C+L">Luke Metz</a> 
</p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2304.12180v2-abstract-short" style="display: inline;"> Unrolled computation graphs are prevalent throughout machine learning but present challenges to automatic differentiation (AD) gradient estimation methods when their loss functions exhibit extreme local sensitivity, discontinuity, or blackbox characteristics. In such scenarios, online evolution strategies methods are a more capable alternative, while being more parallelizable than vanilla evolutio&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.12180v2-abstract-full').style.display = 'inline'; document.getElementById('2304.12180v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2304.12180v2-abstract-full" style="display: none;"> Unrolled computation graphs are prevalent throughout machine learning but present challenges to automatic differentiation (AD) gradient estimation methods when their loss functions exhibit extreme local sensitivity, discontinuity, or blackbox characteristics. In such scenarios, online evolution strategies methods are a more capable alternative, while being more parallelizable than vanilla evolution strategies (ES) by interleaving partial unrolls and gradient updates. In this work, we propose a general class of unbiased online evolution strategies methods. We analytically and empirically characterize the variance of this class of gradient estimators and identify the one with the least variance, which we term Noise-Reuse Evolution Strategies (NRES).
Experimentally, we show NRES results in faster convergence than existing AD and ES methods in terms of wall-clock time and number of unroll steps across a variety of applications, including learning dynamical systems, meta-training learned optimizers, and reinforcement learning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.12180v2-abstract-full').style.display = 'none'; document.getElementById('2304.12180v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 April, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2023. 41 pages. 
Code available at https://github.com/OscarcarLi/Noise-Reuse-Evolution-Strategies</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2302.10093">arXiv:2302.10093</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2302.10093">pdf</a>, <a href="https://arxiv.org/format/2302.10093">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Progressive Ensemble Distillation: Building Ensembles for Efficient Inference </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Dennis%2C+D+K">Don Kurian Dennis</a>, <a href="/search/cs?searchtype=author&amp;query=Shetty%2C+A">Abhishek Shetty</a>, <a href="/search/cs?searchtype=author&amp;query=Sevekari%2C+A">Anish Sevekari</a>, <a href="/search/cs?searchtype=author&amp;query=Koishida%2C+K">Kazuhito Koishida</a>, <a href="/search/cs?searchtype=author&amp;query=Smith%2C+V">Virginia Smith</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2302.10093v2-abstract-short" style="display: inline;"> We study the problem of progressive ensemble distillation: Given a large, pretrained teacher model $g$, we seek to decompose the model into smaller, low-inference cost student models $f_i$, such that progressively evaluating additional models in this ensemble leads to improved predictions. The resulting ensemble allows for flexibly tuning accuracy vs. 
inference cost at runtime, which is useful for&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2302.10093v2-abstract-full').style.display = 'inline'; document.getElementById('2302.10093v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2302.10093v2-abstract-full" style="display: none;"> We study the problem of progressive ensemble distillation: Given a large, pretrained teacher model $g$, we seek to decompose the model into smaller, low-inference cost student models $f_i$, such that progressively evaluating additional models in this ensemble leads to improved predictions. The resulting ensemble allows for flexibly tuning accuracy vs. inference cost at runtime, which is useful for a number of applications in on-device inference. The method we propose, B-DISTIL, relies on an algorithmic procedure that uses function composition over intermediate activations to construct expressive ensembles with similar performance as $g$, but with smaller student models. We demonstrate the effectiveness of B-DISTIL by decomposing pretrained models across standard image, speech, and sensor datasets. We also provide theoretical guarantees in terms of convergence and generalization. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2302.10093v2-abstract-full').style.display = 'none'; document.getElementById('2302.10093v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 February, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2023.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2302.08533">arXiv:2302.08533</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2302.08533">pdf</a>, <a href="https://arxiv.org/format/2302.08533">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> Federated Learning as a Network Effects Game </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hu%2C+S">Shengyuan Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Ngo%2C+D+D">Dung Daniel Ngo</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+S">Shuran Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Smith%2C+V">Virginia Smith</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Z+S">Zhiwei Steven Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2302.08533v1-abstract-short" style="display: inline;"> Federated Learning (FL) aims to foster collaboration among a population of clients to improve the accuracy of machine learning without directly sharing local data. Although there has been rich literature on designing federated learning algorithms, most prior works implicitly assume that all clients are willing to participate in a FL scheme. 
In practice, clients may not benefit from joining in FL,&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2302.08533v1-abstract-full').style.display = 'inline'; document.getElementById('2302.08533v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2302.08533v1-abstract-full" style="display: none;"> Federated Learning (FL) aims to foster collaboration among a population of clients to improve the accuracy of machine learning without directly sharing local data. Although there has been rich literature on designing federated learning algorithms, most prior works implicitly assume that all clients are willing to participate in a FL scheme. In practice, clients may not benefit from joining in FL, especially in light of potential costs related to issues such as privacy and computation. In this work, we study the clients&#39; incentives in federated learning to help the service provider design better solutions and ensure clients make better decisions. We are the first to model clients&#39; behaviors in FL as a network effects game, where each client&#39;s benefit depends on other clients who also join the network. Using this setup we analyze the dynamics of clients&#39; participation and characterize the equilibrium, where no client has incentives to alter their decision. Specifically, we show that dynamics in the population naturally converge to equilibrium without needing explicit interventions. Finally, we provide a cost-efficient payment scheme that incentivizes clients to reach a desired equilibrium when the initial network is empty. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2302.08533v1-abstract-full').style.display = 'none'; document.getElementById('2302.08533v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 February, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">14 pages of main text, 26 pages in total</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2302.02931">arXiv:2302.02931</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2302.02931">pdf</a>, <a href="https://arxiv.org/format/2302.02931">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Bitrate-Constrained DRO: Beyond Worst Case Robustness To Unknown Group Shifts </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Setlur%2C+A">Amrith Setlur</a>, <a href="/search/cs?searchtype=author&amp;query=Dennis%2C+D">Don Dennis</a>, <a href="/search/cs?searchtype=author&amp;query=Eysenbach%2C+B">Benjamin Eysenbach</a>, <a href="/search/cs?searchtype=author&amp;query=Raghunathan%2C+A">Aditi Raghunathan</a>, <a href="/search/cs?searchtype=author&amp;query=Finn%2C+C">Chelsea Finn</a>, <a href="/search/cs?searchtype=author&amp;query=Smith%2C+V">Virginia Smith</a>, <a href="/search/cs?searchtype=author&amp;query=Levine%2C+S">Sergey Levine</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span 
class="abstract-short has-text-grey-dark mathjax" id="2302.02931v2-abstract-short" style="display: inline;"> Training machine learning models robust to distribution shifts is critical for real-world applications. Some robust training algorithms (e.g., Group DRO) specialize to group shifts and require group information on all training points. Other methods (e.g., CVaR DRO) that do not need group annotations can be overly conservative, since they naively upweight high loss points which may form a contrived&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2302.02931v2-abstract-full').style.display = 'inline'; document.getElementById('2302.02931v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2302.02931v2-abstract-full" style="display: none;"> Training machine learning models robust to distribution shifts is critical for real-world applications. Some robust training algorithms (e.g., Group DRO) specialize to group shifts and require group information on all training points. Other methods (e.g., CVaR DRO) that do not need group annotations can be overly conservative, since they naively upweight high loss points which may form a contrived set that does not correspond to any meaningful group in the real world (e.g., when the high loss points are randomly mislabeled training points). In this work, we address limitations in prior approaches by assuming a more nuanced form of group shift: conditioned on the label, we assume that the true group function (indicator over group) is simple. For example, we may expect that group shifts occur along low bitrate features (e.g., image background, lighting). Thus, we aim to learn a model that maintains high accuracy on simple group functions realized by these low bitrate features, that need not spend valuable model capacity achieving high accuracy on contrived groups of examples. 
Based on this, we consider the two-player game formulation of DRO where the adversary&#39;s capacity is bitrate-constrained. Our resulting practical algorithm, Bitrate-Constrained DRO (BR-DRO), does not require group information on training samples yet matches the performance of Group DRO on datasets that have training group annotations and that of CVaR DRO on long-tailed distributions. Our theoretical analysis reveals that in some settings BR-DRO objective can provably yield statistically efficient and less conservative solutions than unconstrained CVaR DRO. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2302.02931v2-abstract-full').style.display = 'none'; document.getElementById('2302.02931v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 February, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2023. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> ICLR 2023 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2212.08930">arXiv:2212.08930</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2212.08930">pdf</a>, <a href="https://arxiv.org/format/2212.08930">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> On Noisy Evaluation in Federated Hyperparameter Tuning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Kuo%2C+K">Kevin Kuo</a>, <a href="/search/cs?searchtype=author&amp;query=Thaker%2C+P">Pratiksha Thaker</a>, <a href="/search/cs?searchtype=author&amp;query=Khodak%2C+M">Mikhail Khodak</a>, <a href="/search/cs?searchtype=author&amp;query=Nguyen%2C+J">John Nguyen</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+D">Daniel Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Talwalkar%2C+A">Ameet Talwalkar</a>, <a href="/search/cs?searchtype=author&amp;query=Smith%2C+V">Virginia Smith</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2212.08930v4-abstract-short" style="display: inline;"> Hyperparameter tuning is critical to the success of federated learning applications. Unfortunately, appropriately selecting hyperparameters is challenging in federated networks. Issues of scale, privacy, and heterogeneity introduce noise in the tuning process and make it difficult to evaluate the performance of various hyperparameters. 
In this work, we perform the first systematic study on the eff&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.08930v4-abstract-full').style.display = 'inline'; document.getElementById('2212.08930v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2212.08930v4-abstract-full" style="display: none;"> Hyperparameter tuning is critical to the success of federated learning applications. Unfortunately, appropriately selecting hyperparameters is challenging in federated networks. Issues of scale, privacy, and heterogeneity introduce noise in the tuning process and make it difficult to evaluate the performance of various hyperparameters. In this work, we perform the first systematic study on the effect of noisy evaluation in federated hyperparameter tuning. We first identify and rigorously explore key sources of noise, including client subsampling, data and systems heterogeneity, and data privacy. Surprisingly, our results indicate that even small amounts of noise can significantly impact tuning methods -- reducing the performance of state-of-the-art approaches to that of naive baselines. To address noisy evaluation in such scenarios, we propose a simple and effective approach that leverages public proxy data to boost the evaluation signal. Our work establishes general challenges, baselines, and best practices for future work in federated hyperparameter tuning.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.08930v4-abstract-full').style.display = 'none'; document.getElementById('2212.08930v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 December, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">v1: 19 pages, 15 figures, submitted to MLSys2023; v2: Fixed citation formatting; v3: Fixed typo, update acks; v4: MLSys2023 camera-ready</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2212.00309">arXiv:2212.00309</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2212.00309">pdf</a>, <a href="https://arxiv.org/format/2212.00309">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Differentially Private Adaptive Optimization with Delayed Preconditioners </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+T">Tian Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zaheer%2C+M">Manzil Zaheer</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+K+Z">Ken Ziyu Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Reddi%2C+S+J">Sashank J. Reddi</a>, <a href="/search/cs?searchtype=author&amp;query=McMahan%2C+H+B">H.
Brendan McMahan</a>, <a href="/search/cs?searchtype=author&amp;query=Smith%2C+V">Virginia Smith</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2212.00309v2-abstract-short" style="display: inline;"> Privacy noise may negate the benefits of using adaptive optimizers in differentially private model training. Prior works typically address this issue by using auxiliary information (e.g., public data) to boost the effectiveness of adaptive optimization. In this work, we explore techniques to estimate and efficiently adapt to gradient geometry in private adaptive optimization without auxiliary data&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.00309v2-abstract-full').style.display = 'inline'; document.getElementById('2212.00309v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2212.00309v2-abstract-full" style="display: none;"> Privacy noise may negate the benefits of using adaptive optimizers in differentially private model training. Prior works typically address this issue by using auxiliary information (e.g., public data) to boost the effectiveness of adaptive optimization. In this work, we explore techniques to estimate and efficiently adapt to gradient geometry in private adaptive optimization without auxiliary data. Motivated by the observation that adaptive methods can tolerate stale preconditioners, we propose differentially private adaptive training with delayed preconditioners (DP^2), a simple method that constructs delayed but less noisy preconditioners to better realize the benefits of adaptivity. Theoretically, we provide convergence guarantees for our method for both convex and non-convex problems, and analyze trade-offs between delay and privacy noise reduction. 
Empirically, we explore DP^2 across several real-world datasets, demonstrating that it can improve convergence speed by as much as 4x relative to non-adaptive baselines and match the performance of state-of-the-art optimization methods that require auxiliary data. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.00309v2-abstract-full').style.display = 'none'; document.getElementById('2212.00309v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 December, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICLR 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2211.15458">arXiv:2211.15458</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2211.15458">pdf</a>, <a href="https://arxiv.org/format/2211.15458">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Validating Large Language Models with ReLM </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Kuchnik%2C+M">Michael Kuchnik</a>, <a href="/search/cs?searchtype=author&amp;query=Smith%2C+V">Virginia Smith</a>, <a href="/search/cs?searchtype=author&amp;query=Amvrosiadis%2C+G">George Amvrosiadis</a> </p> <p class="abstract mathjax"> 
<span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2211.15458v2-abstract-short" style="display: inline;"> Although large language models (LLMs) have been touted for their ability to generate natural-sounding text, there are growing concerns around possible negative effects of LLMs such as data memorization, bias, and inappropriate language. Unfortunately, the complexity and generation capacities of LLMs make validating (and correcting) such concerns difficult. In this work, we introduce ReLM, a system&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.15458v2-abstract-full').style.display = 'inline'; document.getElementById('2211.15458v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2211.15458v2-abstract-full" style="display: none;"> Although large language models (LLMs) have been touted for their ability to generate natural-sounding text, there are growing concerns around possible negative effects of LLMs such as data memorization, bias, and inappropriate language. Unfortunately, the complexity and generation capacities of LLMs make validating (and correcting) such concerns difficult. In this work, we introduce ReLM, a system for validating and querying LLMs using standard regular expressions. ReLM formalizes and enables a broad range of language model evaluations, reducing complex evaluation rules to simple regular expression queries. Our results exploring queries surrounding memorization, gender bias, toxicity, and language understanding show that ReLM achieves up to 15x higher system efficiency, 2.5x data efficiency, and increased statistical and prompt-tuning coverage compared to state-of-the-art ad-hoc queries. ReLM offers a competitive and general baseline for the increasingly important problem of LLM validation. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.15458v2-abstract-full').style.display = 'none'; document.getElementById('2211.15458v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 November, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2208.00467">arXiv:2208.00467</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2208.00467">pdf</a>, <a href="https://arxiv.org/format/2208.00467">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3550316">10.1145/3550316 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> COCOA: Cross Modality Contrastive Learning for Sensor Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Deldari%2C+S">Shohreh Deldari</a>, <a href="/search/cs?searchtype=author&amp;query=Xue%2C+H">Hao Xue</a>, <a href="/search/cs?searchtype=author&amp;query=Saeed%2C+A">Aaqib Saeed</a>, <a href="/search/cs?searchtype=author&amp;query=Smith%2C+D+V">Daniel V. 
Smith</a>, <a href="/search/cs?searchtype=author&amp;query=Salim%2C+F+D">Flora D. Salim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2208.00467v2-abstract-short" style="display: inline;"> Self-Supervised Learning (SSL) is a new paradigm for learning discriminative representations without labelled data and has reached comparable or even state-of-the-art results in comparison to supervised counterparts. Contrastive Learning (CL) is one of the most well-known approaches in SSL that attempts to learn general, informative representations of data. CL methods have been mostly developed fo&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2208.00467v2-abstract-full').style.display = 'inline'; document.getElementById('2208.00467v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2208.00467v2-abstract-full" style="display: none;"> Self-Supervised Learning (SSL) is a new paradigm for learning discriminative representations without labelled data and has reached comparable or even state-of-the-art results in comparison to supervised counterparts. Contrastive Learning (CL) is one of the most well-known approaches in SSL that attempts to learn general, informative representations of data. CL methods have been mostly developed for applications in computer vision and natural language processing where only a single sensor modality is used. A majority of pervasive computing applications, however, exploit data from a range of different sensor modalities. 
While existing CL methods are limited to learning from one or two data sources, we propose COCOA (Cross mOdality COntrastive leArning), a self-supervised model that employs a novel objective function to learn quality representations from multisensor data by computing the cross-correlation between different data modalities and minimizing the similarity between irrelevant instances. We evaluate the effectiveness of COCOA against eight recently introduced state-of-the-art self-supervised models, and two supervised baselines across five public datasets. We show that COCOA achieves superior classification performance to all other approaches. Also, COCOA is far more label-efficient than the other baselines including the fully supervised model using only one-tenth of available labelled data. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2208.00467v2-abstract-full').style.display = 'none'; document.getElementById('2208.00467v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 August, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 31 July, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">27 pages, 10 figures, 6 tables, Accepted with minor revision at IMWUT Vol. 6 No. 
3</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2206.09262">arXiv:2206.09262</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2206.09262">pdf</a>, <a href="https://arxiv.org/format/2206.09262">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> Motley: Benchmarking Heterogeneity and Personalization in Federated Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wu%2C+S">Shanshan Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+T">Tian Li</a>, <a href="/search/cs?searchtype=author&amp;query=Charles%2C+Z">Zachary Charles</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+Y">Yu Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Ziyu Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+Z">Zheng Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Smith%2C+V">Virginia Smith</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2206.09262v6-abstract-short" style="display: inline;"> Personalized federated learning considers learning models unique to each client in a heterogeneous network. The resulting client-specific models have been purported to improve metrics such as accuracy, fairness, and robustness in federated networks. 
However, despite a plethora of work in this area, it remains unclear: (1) which personalization techniques are most effective in various settings, and&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2206.09262v6-abstract-full').style.display = 'inline'; document.getElementById('2206.09262v6-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2206.09262v6-abstract-full" style="display: none;"> Personalized federated learning considers learning models unique to each client in a heterogeneous network. The resulting client-specific models have been purported to improve metrics such as accuracy, fairness, and robustness in federated networks. However, despite a plethora of work in this area, it remains unclear: (1) which personalization techniques are most effective in various settings, and (2) how important personalization truly is for realistic federated applications. To better answer these questions, we propose Motley, a benchmark for personalized federated learning. Motley consists of a suite of cross-device and cross-silo federated datasets from varied problem domains, as well as thorough evaluation metrics for better understanding the possible impacts of personalization. We establish baselines on the benchmark by comparing a number of representative personalized federated learning methods. These initial results highlight strengths and weaknesses of existing approaches, and raise several open questions for the community. Motley aims to provide a reproducible means with which to advance developments in personalized and heterogeneity-aware federated learning, as well as the related areas of transfer learning, meta-learning, and multi-task learning. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2206.09262v6-abstract-full').style.display = 'none'; document.getElementById('2206.09262v6-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 September, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 June, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">40 pages, 10 figures, 7 tables. EMNIST and Landmarks fine-tuning results are corrected in (and after) v5. Code: https://github.com/google-research/federated/tree/master/personalization_benchmark</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2206.07902">arXiv:2206.07902</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2206.07902">pdf</a>, <a href="https://arxiv.org/format/2206.07902">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> On Privacy and Personalization in Cross-Silo Federated Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Ziyu Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+S">Shengyuan Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Z+S">Zhiwei Steven Wu</a>, <a 
href="/search/cs?searchtype=author&amp;query=Smith%2C+V">Virginia Smith</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2206.07902v2-abstract-short" style="display: inline;"> While the application of differential privacy (DP) has been well-studied in cross-device federated learning (FL), there is a lack of work considering DP and its implications for cross-silo FL, a setting characterized by a limited number of clients each containing many data subjects. In cross-silo FL, usual notions of client-level DP are less suitable as real-world privacy regulations typically con&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2206.07902v2-abstract-full').style.display = 'inline'; document.getElementById('2206.07902v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2206.07902v2-abstract-full" style="display: none;"> While the application of differential privacy (DP) has been well-studied in cross-device federated learning (FL), there is a lack of work considering DP and its implications for cross-silo FL, a setting characterized by a limited number of clients each containing many data subjects. In cross-silo FL, usual notions of client-level DP are less suitable as real-world privacy regulations typically concern the in-silo data subjects rather than the silos themselves. In this work, we instead consider an alternative notion of silo-specific sample-level DP, where silos set their own privacy targets for their local examples. Under this setting, we reconsider the roles of personalization in federated learning. 
In particular, we show that mean-regularized multi-task learning (MR-MTL), a simple personalization framework, is a strong baseline for cross-silo FL: under stronger privacy requirements, silos are incentivized to federate more with each other to mitigate DP noise, resulting in consistent improvements relative to standard baseline methods. We provide an empirical study of competing methods as well as a theoretical characterization of MR-MTL for mean estimation, highlighting the interplay between privacy and cross-silo data heterogeneity. Our work serves to establish baselines for private cross-silo FL as well as identify key directions of future work in this area. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2206.07902v2-abstract-full').style.display = 'none'; document.getElementById('2206.07902v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 June, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2022. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2022, 37 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2206.02353">arXiv:2206.02353</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2206.02353">pdf</a>, <a href="https://arxiv.org/format/2206.02353">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Beyond Just Vision: A Review on Self-Supervised Representation Learning on Multimodal and Temporal Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Deldari%2C+S">Shohreh Deldari</a>, <a href="/search/cs?searchtype=author&amp;query=Xue%2C+H">Hao Xue</a>, <a href="/search/cs?searchtype=author&amp;query=Saeed%2C+A">Aaqib Saeed</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+J">Jiayuan He</a>, <a href="/search/cs?searchtype=author&amp;query=Smith%2C+D+V">Daniel V. Smith</a>, <a href="/search/cs?searchtype=author&amp;query=Salim%2C+F+D">Flora D. Salim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2206.02353v2-abstract-short" style="display: inline;"> Recently, Self-Supervised Representation Learning (SSRL) has attracted much attention in the field of computer vision, speech, natural language processing (NLP), and recently, with other types of modalities, including time series from sensors. 
The popularity of self-supervised learning is driven by the fact that traditional models typically require a huge amount of well-annotated data for training&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2206.02353v2-abstract-full').style.display = 'inline'; document.getElementById('2206.02353v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2206.02353v2-abstract-full" style="display: none;"> Recently, Self-Supervised Representation Learning (SSRL) has attracted much attention in the field of computer vision, speech, natural language processing (NLP), and recently, with other types of modalities, including time series from sensors. The popularity of self-supervised learning is driven by the fact that traditional models typically require a huge amount of well-annotated data for training. Acquiring annotated data can be a difficult and costly process. Self-supervised methods have been introduced to improve the efficiency of training data through discriminative pre-training of models using supervisory signals that have been freely obtained from the raw data. Unlike existing reviews of SSRL that have predominantly focused upon methods in the fields of CV or NLP for a single modality, we aim to provide the first comprehensive review of multimodal self-supervised learning methods for temporal data. To this end, we 1) provide a comprehensive categorization of existing SSRL methods, 2) introduce a generic pipeline by defining the key components of an SSRL framework, 3) compare existing models in terms of their objective function, network architecture and potential applications, and 4) review existing multimodal techniques in each category and various modalities. Finally, we present existing weaknesses and future opportunities. 
We believe our work develops a perspective on the requirements of SSRL in domains that utilise multimodal and/or temporal data. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2206.02353v2-abstract-full').style.display = 'none'; document.getElementById('2206.02353v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 June, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 June, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">36 pages, 5 figures, 9 tables, Survey paper</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2206.01367">arXiv:2206.01367</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2206.01367">pdf</a>, <a href="https://arxiv.org/format/2206.01367">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Adversarial Unlearning: Reducing Confidence Along Adversarial Directions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Setlur%2C+A">Amrith Setlur</a>, <a href="/search/cs?searchtype=author&amp;query=Eysenbach%2C+B">Benjamin Eysenbach</a>, <a href="/search/cs?searchtype=author&amp;query=Smith%2C+V">Virginia Smith</a>, <a href="/search/cs?searchtype=author&amp;query=Levine%2C+S">Sergey Levine</a> </p> <p class="abstract mathjax"> <span 
class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2206.01367v1-abstract-short" style="display: inline;"> Supervised learning methods trained with maximum likelihood objectives often overfit on training data. Most regularizers that prevent overfitting look to increase confidence on additional examples (e.g., data augmentation, adversarial training), or reduce it on training data (e.g., label smoothing). In this work we propose a complementary regularization strategy that reduces confidence on self-gen&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2206.01367v1-abstract-full').style.display = 'inline'; document.getElementById('2206.01367v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2206.01367v1-abstract-full" style="display: none;"> Supervised learning methods trained with maximum likelihood objectives often overfit on training data. Most regularizers that prevent overfitting look to increase confidence on additional examples (e.g., data augmentation, adversarial training), or reduce it on training data (e.g., label smoothing). In this work we propose a complementary regularization strategy that reduces confidence on self-generated examples. The method, which we call RCAD (Reducing Confidence along Adversarial Directions), aims to reduce confidence on out-of-distribution examples lying along directions adversarially chosen to increase training loss. In contrast to adversarial training, RCAD does not try to robustify the model to output the original label, but rather regularizes it to have reduced confidence on points generated using much larger perturbations than in conventional adversarial training. RCAD can be easily integrated into training pipelines with a few lines of code. 
Despite its simplicity, we find on many classification benchmarks that RCAD can be added to existing techniques (e.g., label smoothing, MixUp training) to increase test accuracy by 1-3% in absolute value, with more significant gains in the low data regime. We also provide a theoretical analysis that helps to explain these benefits in simplified settings, showing that RCAD can provably help the model unlearn spurious features in the training data. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2206.01367v1-abstract-full').style.display = 'none'; document.getElementById('2206.01367v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 June, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2205.14840">arXiv:2205.14840</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2205.14840">pdf</a>, <a href="https://arxiv.org/format/2205.14840">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Maximizing Global Model Appeal in Federated Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Cho%2C+Y+J">Yae Jee Cho</a>, <a href="/search/cs?searchtype=author&amp;query=Jhunjhunwala%2C+D">Divyansh Jhunjhunwala</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+T">Tian Li</a>, <a href="/search/cs?searchtype=author&amp;query=Smith%2C+V">Virginia Smith</a>, <a href="/search/cs?searchtype=author&amp;query=Joshi%2C+G">Gauri Joshi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis 
has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2205.14840v2-abstract-short" style="display: inline;"> Federated learning typically considers collaboratively training a global model using local data at edge clients. Clients may have their own individual requirements, such as having a minimal training loss threshold, which they expect to be met by the global model. However, due to client heterogeneity, the global model may not meet each client&#39;s requirements, and only a small subset may find the glo&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2205.14840v2-abstract-full').style.display = 'inline'; document.getElementById('2205.14840v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2205.14840v2-abstract-full" style="display: none;"> Federated learning typically considers collaboratively training a global model using local data at edge clients. Clients may have their own individual requirements, such as having a minimal training loss threshold, which they expect to be met by the global model. However, due to client heterogeneity, the global model may not meet each client&#39;s requirements, and only a small subset may find the global model appealing. In this work, we explore the problem of the global model lacking appeal to the clients due to not being able to satisfy local requirements. We propose MaxFL, which aims to maximize the number of clients that find the global model appealing. We show that having a high global model appeal is important to maintain an adequate pool of clients for training, and can directly improve the test accuracy on both seen and unseen clients. 
We provide convergence guarantees for MaxFL and show that MaxFL achieves a $22$-$40\%$ and $18$-$50\%$ test accuracy improvement for the training clients and unseen clients respectively, compared to a wide range of FL modeling approaches, including those that tackle data heterogeneity, aim to incentivize clients, and learn personalized or fair models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2205.14840v2-abstract-full').style.display = 'none'; document.getElementById('2205.14840v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 February, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 30 May, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2203.10190">arXiv:2203.10190</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2203.10190">pdf</a>, <a href="https://arxiv.org/format/2203.10190">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> </div> </div> <p class="title is-5 mathjax"> Fair Federated Learning via Bounded Group Loss </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hu%2C+S">Shengyuan Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Z+S">Zhiwei Steven Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Smith%2C+V">Virginia Smith</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short 
has-text-grey-dark mathjax" id="2203.10190v3-abstract-short" style="display: inline;"> Fair prediction across protected groups is an important constraint for many federated learning applications. However, prior work studying group fair federated learning lacks formal convergence or fairness guarantees. In this work we propose a general framework for provably fair federated learning. In particular, we explore and extend the notion of Bounded Group Loss as a theoretically-grounded app&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.10190v3-abstract-full').style.display = 'inline'; document.getElementById('2203.10190v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2203.10190v3-abstract-full" style="display: none;"> Fair prediction across protected groups is an important constraint for many federated learning applications. However, prior work studying group fair federated learning lacks formal convergence or fairness guarantees. In this work we propose a general framework for provably fair federated learning. In particular, we explore and extend the notion of Bounded Group Loss as a theoretically-grounded approach for group fairness. Using this setup, we propose a scalable federated optimization method that optimizes the empirical risk under a number of group fairness constraints. We provide convergence guarantees for the method as well as fairness guarantees for the resulting solution. Empirically, we evaluate our method across common benchmarks from fair ML and federated learning, showing that it can provide both fairer and more accurate predictions than baseline approaches. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.10190v3-abstract-full').style.display = 'none'; document.getElementById('2203.10190v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 March, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">19 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2202.05963">arXiv:2202.05963</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2202.05963">pdf</a>, <a href="https://arxiv.org/format/2202.05963">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Private Adaptive Optimization with Side Information </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+T">Tian Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zaheer%2C+M">Manzil Zaheer</a>, <a href="/search/cs?searchtype=author&amp;query=Reddi%2C+S+J">Sashank J. 
Reddi</a>, <a href="/search/cs?searchtype=author&amp;query=Smith%2C+V">Virginia Smith</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2202.05963v2-abstract-short" style="display: inline;"> Adaptive optimization methods have become the default solvers for many machine learning tasks. Unfortunately, the benefits of adaptivity may degrade when training with differential privacy, as the noise added to ensure privacy reduces the effectiveness of the adaptive preconditioner. To this end, we propose AdaDPS, a general framework that uses non-sensitive side information to precondition the gr&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2202.05963v2-abstract-full').style.display = 'inline'; document.getElementById('2202.05963v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2202.05963v2-abstract-full" style="display: none;"> Adaptive optimization methods have become the default solvers for many machine learning tasks. Unfortunately, the benefits of adaptivity may degrade when training with differential privacy, as the noise added to ensure privacy reduces the effectiveness of the adaptive preconditioner. To this end, we propose AdaDPS, a general framework that uses non-sensitive side information to precondition the gradients, allowing the effective use of adaptive methods in private settings. We formally show AdaDPS reduces the amount of noise needed to achieve similar privacy guarantees, thereby improving optimization performance. Empirically, we leverage simple and readily available side information to explore the performance of AdaDPS in practice, comparing to strong baselines in both centralized and federated settings. 
Our results show that AdaDPS improves accuracy by 7.7% (absolute) on average -- yielding state-of-the-art privacy-utility trade-offs on large-scale text and image benchmarks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2202.05963v2-abstract-full').style.display = 'none'; document.getElementById('2202.05963v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 June, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 11 February, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICML 2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2111.04131">arXiv:2111.04131</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2111.04131">pdf</a>, <a href="https://arxiv.org/format/2111.04131">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Performance">cs.PF</span> </div> </div> <p class="title is-5 mathjax"> Plumber: Diagnosing and Removing Performance Bottlenecks in Machine Learning Data Pipelines </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Kuchnik%2C+M">Michael Kuchnik</a>, <a href="/search/cs?searchtype=author&amp;query=Klimovic%2C+A">Ana Klimovic</a>, <a href="/search/cs?searchtype=author&amp;query=Simsa%2C+J">Jiri Simsa</a>, <a href="/search/cs?searchtype=author&amp;query=Smith%2C+V">Virginia Smith</a>, <a 
href="/search/cs?searchtype=author&amp;query=Amvrosiadis%2C+G">George Amvrosiadis</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2111.04131v2-abstract-short" style="display: inline;"> Input pipelines, which ingest and transform input data, are an essential part of training Machine Learning (ML) models. However, it is challenging to implement efficient input pipelines, as it requires reasoning about parallelism, asynchrony, and variability in fine-grained profiling information. Our analysis of over two million ML jobs in Google datacenters reveals that a significant fraction of&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2111.04131v2-abstract-full').style.display = 'inline'; document.getElementById('2111.04131v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2111.04131v2-abstract-full" style="display: none;"> Input pipelines, which ingest and transform input data, are an essential part of training Machine Learning (ML) models. However, it is challenging to implement efficient input pipelines, as it requires reasoning about parallelism, asynchrony, and variability in fine-grained profiling information. Our analysis of over two million ML jobs in Google datacenters reveals that a significant fraction of model training jobs could benefit from faster input data pipelines. At the same time, our analysis indicates that most jobs do not saturate host hardware, pointing in the direction of software-based bottlenecks. Motivated by these findings, we propose Plumber, a tool for finding bottlenecks in ML input pipelines. Plumber uses an extensible and interpretable operational analysis analytical model to automatically tune parallelism, prefetching, and caching under host resource constraints. 
Across five representative ML pipelines, Plumber obtains speedups of up to 47x for misconfigured pipelines. By automating caching, Plumber obtains end-to-end speedups of over 50% compared to state-of-the-art tuners. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2111.04131v2-abstract-full').style.display = 'none'; document.getElementById('2111.04131v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 March, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 November, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2021. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2109.06141">arXiv:2109.06141</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2109.06141">pdf</a>, <a href="https://arxiv.org/format/2109.06141">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Optimization and Control">math.OC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> On Tilted Losses in Machine Learning: Theory and Applications </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+T">Tian Li</a>, <a href="/search/cs?searchtype=author&amp;query=Beirami%2C+A">Ahmad Beirami</a>, <a href="/search/cs?searchtype=author&amp;query=Sanjabi%2C+M">Maziar Sanjabi</a>, <a 
href="/search/cs?searchtype=author&amp;query=Smith%2C+V">Virginia Smith</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2109.06141v3-abstract-short" style="display: inline;"> Exponential tilting is a technique commonly used in fields such as statistics, probability, information theory, and optimization to create parametric distribution shifts. Despite its prevalence in related fields, tilting has not seen widespread use in machine learning. In this work, we aim to bridge this gap by exploring the use of tilting in risk minimization. We study a simple extension to ERM -&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2109.06141v3-abstract-full').style.display = 'inline'; document.getElementById('2109.06141v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2109.06141v3-abstract-full" style="display: none;"> Exponential tilting is a technique commonly used in fields such as statistics, probability, information theory, and optimization to create parametric distribution shifts. Despite its prevalence in related fields, tilting has not seen widespread use in machine learning. In this work, we aim to bridge this gap by exploring the use of tilting in risk minimization. We study a simple extension to ERM -- tilted empirical risk minimization (TERM) -- which uses exponential tilting to flexibly tune the impact of individual losses. The resulting framework has several useful properties: We show that TERM can increase or decrease the influence of outliers, respectively, to enable fairness or robustness; has variance-reduction properties that can benefit generalization; and can be viewed as a smooth approximation to the tail probability of losses. 
Our work makes rigorous connections between TERM and related objectives, such as Value-at-Risk, Conditional Value-at-Risk, and distributionally robust optimization (DRO). We develop batch and stochastic first-order optimization methods for solving TERM, provide convergence guarantees for the solvers, and show that the framework can be efficiently solved relative to common alternatives. Finally, we demonstrate that TERM can be used for a multitude of applications in machine learning, such as enforcing fairness between subgroups, mitigating the effect of outliers, and handling class imbalance. Despite the straightforward modification TERM makes to traditional ERM objectives, we find that the framework can consistently outperform ERM and deliver competitive performance with state-of-the-art, problem-specific approaches. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2109.06141v3-abstract-full').style.display = 'none'; document.getElementById('2109.06141v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 September, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2021. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">arXiv admin note: substantial text overlap with arXiv:2007.01162</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2108.12978">arXiv:2108.12978</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2108.12978">pdf</a>, <a href="https://arxiv.org/format/2108.12978">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Private Multi-Task Learning: Formulation and Applications to Federated Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hu%2C+S">Shengyuan Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Z+S">Zhiwei Steven Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Smith%2C+V">Virginia Smith</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2108.12978v3-abstract-short" style="display: inline;"> Many problems in machine learning rely on multi-task learning (MTL), in which the goal is to solve multiple related machine learning tasks simultaneously. MTL is particularly relevant for privacy-sensitive applications in areas such as healthcare, finance, and IoT computing, where sensitive data from multiple, varied sources are shared for the purpose of learning. 
In this work, we formalize notion&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2108.12978v3-abstract-full').style.display = 'inline'; document.getElementById('2108.12978v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2108.12978v3-abstract-full" style="display: none;"> Many problems in machine learning rely on multi-task learning (MTL), in which the goal is to solve multiple related machine learning tasks simultaneously. MTL is particularly relevant for privacy-sensitive applications in areas such as healthcare, finance, and IoT computing, where sensitive data from multiple, varied sources are shared for the purpose of learning. In this work, we formalize notions of client-level privacy for MTL via joint differential privacy (JDP), a relaxation of differential privacy for mechanism design and distributed optimization. We then propose an algorithm for mean-regularized MTL, an objective commonly used for applications in personalized federated learning, subject to JDP. We analyze our objective and solver, providing certifiable guarantees on both privacy and utility. Empirically, we find that our method provides improved privacy/utility trade-offs relative to global baselines across common federated learning benchmarks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2108.12978v3-abstract-full').style.display = 'none'; document.getElementById('2108.12978v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 August, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2021. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to TMLR. Transactions on Machine Learning Research (2022)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2107.06917">arXiv:2107.06917</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2107.06917">pdf</a>, <a href="https://arxiv.org/format/2107.06917">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> A Field Guide to Federated Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Jianyu Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Charles%2C+Z">Zachary Charles</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+Z">Zheng Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Joshi%2C+G">Gauri Joshi</a>, <a href="/search/cs?searchtype=author&amp;query=McMahan%2C+H+B">H. 
Brendan McMahan</a>, <a href="/search/cs?searchtype=author&amp;query=Arcas%2C+B+A+y">Blaise Aguera y Arcas</a>, <a href="/search/cs?searchtype=author&amp;query=Al-Shedivat%2C+M">Maruan Al-Shedivat</a>, <a href="/search/cs?searchtype=author&amp;query=Andrew%2C+G">Galen Andrew</a>, <a href="/search/cs?searchtype=author&amp;query=Avestimehr%2C+S">Salman Avestimehr</a>, <a href="/search/cs?searchtype=author&amp;query=Daly%2C+K">Katharine Daly</a>, <a href="/search/cs?searchtype=author&amp;query=Data%2C+D">Deepesh Data</a>, <a href="/search/cs?searchtype=author&amp;query=Diggavi%2C+S">Suhas Diggavi</a>, <a href="/search/cs?searchtype=author&amp;query=Eichner%2C+H">Hubert Eichner</a>, <a href="/search/cs?searchtype=author&amp;query=Gadhikar%2C+A">Advait Gadhikar</a>, <a href="/search/cs?searchtype=author&amp;query=Garrett%2C+Z">Zachary Garrett</a>, <a href="/search/cs?searchtype=author&amp;query=Girgis%2C+A+M">Antonious M. Girgis</a>, <a href="/search/cs?searchtype=author&amp;query=Hanzely%2C+F">Filip Hanzely</a>, <a href="/search/cs?searchtype=author&amp;query=Hard%2C+A">Andrew Hard</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+C">Chaoyang He</a>, <a href="/search/cs?searchtype=author&amp;query=Horvath%2C+S">Samuel Horvath</a>, <a href="/search/cs?searchtype=author&amp;query=Huo%2C+Z">Zhouyuan Huo</a>, <a href="/search/cs?searchtype=author&amp;query=Ingerman%2C+A">Alex Ingerman</a>, <a href="/search/cs?searchtype=author&amp;query=Jaggi%2C+M">Martin Jaggi</a>, <a href="/search/cs?searchtype=author&amp;query=Javidi%2C+T">Tara Javidi</a>, <a href="/search/cs?searchtype=author&amp;query=Kairouz%2C+P">Peter Kairouz</a> , et al. 
(28 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2107.06917v1-abstract-short" style="display: inline;"> Federated learning and analytics are a distributed approach for collaboratively learning models (or statistics) from decentralized data, motivated by and designed for privacy protection. The distributed learning process can be formulated as solving federated optimization problems, which emphasize communication efficiency, data heterogeneity, compatibility with privacy and system requirements, and&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2107.06917v1-abstract-full').style.display = 'inline'; document.getElementById('2107.06917v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2107.06917v1-abstract-full" style="display: none;"> Federated learning and analytics are a distributed approach for collaboratively learning models (or statistics) from decentralized data, motivated by and designed for privacy protection. The distributed learning process can be formulated as solving federated optimization problems, which emphasize communication efficiency, data heterogeneity, compatibility with privacy and system requirements, and other constraints that are not primary considerations in other problem settings. This paper provides recommendations and guidelines on formulating, designing, evaluating and analyzing federated optimization algorithms through concrete examples and practical implementation, with a focus on conducting effective simulations to infer real-world performance. The goal of this work is not to survey the current literature, but to inspire researchers and practitioners to design federated learning algorithms that can be used in various practical applications. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2107.06917v1-abstract-full').style.display = 'none'; document.getElementById('2107.06917v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 July, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2021. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2106.07820">arXiv:2106.07820</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2106.07820">pdf</a>, <a href="https://arxiv.org/format/2106.07820">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> On Large-Cohort Training for Federated Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Charles%2C+Z">Zachary Charles</a>, <a href="/search/cs?searchtype=author&amp;query=Garrett%2C+Z">Zachary Garrett</a>, <a href="/search/cs?searchtype=author&amp;query=Huo%2C+Z">Zhouyuan Huo</a>, <a href="/search/cs?searchtype=author&amp;query=Shmulyian%2C+S">Sergei Shmulyian</a>, <a href="/search/cs?searchtype=author&amp;query=Smith%2C+V">Virginia Smith</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2106.07820v1-abstract-short" style="display: inline;"> Federated learning methods typically learn a model by iteratively sampling updates from a population of clients. 
In this work, we explore how the number of clients sampled at each round (the cohort size) impacts the quality of the learned model and the training dynamics of federated learning algorithms. Our work poses three fundamental questions. First, what challenges arise when trying to scale f&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2106.07820v1-abstract-full').style.display = 'inline'; document.getElementById('2106.07820v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2106.07820v1-abstract-full" style="display: none;"> Federated learning methods typically learn a model by iteratively sampling updates from a population of clients. In this work, we explore how the number of clients sampled at each round (the cohort size) impacts the quality of the learned model and the training dynamics of federated learning algorithms. Our work poses three fundamental questions. First, what challenges arise when trying to scale federated learning to larger cohorts? Second, what parallels exist between cohort sizes in federated learning and batch sizes in centralized learning? Last, how can we design federated learning methods that effectively utilize larger cohort sizes? We give partial answers to these questions based on extensive empirical evaluation. Our work highlights a number of challenges stemming from the use of larger cohorts. While some of these (such as generalization issues and diminishing returns) are analogs of large-batch training challenges, others (including training failures and fairness concerns) are unique to federated learning. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2106.07820v1-abstract-full').style.display = 'none'; document.getElementById('2106.07820v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 June, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2021. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2106.04502">arXiv:2106.04502</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2106.04502">pdf</a>, <a href="https://arxiv.org/format/2106.04502">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Federated Hyperparameter Tuning: Challenges, Baselines, and Connections to Weight-Sharing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Khodak%2C+M">Mikhail Khodak</a>, <a href="/search/cs?searchtype=author&amp;query=Tu%2C+R">Renbo Tu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+T">Tian Li</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+L">Liam Li</a>, <a href="/search/cs?searchtype=author&amp;query=Balcan%2C+M">Maria-Florina Balcan</a>, <a href="/search/cs?searchtype=author&amp;query=Smith%2C+V">Virginia Smith</a>, <a href="/search/cs?searchtype=author&amp;query=Talwalkar%2C+A">Ameet Talwalkar</a> </p> <p 
class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2106.04502v2-abstract-short" style="display: inline;"> Tuning hyperparameters is a crucial but arduous part of the machine learning pipeline. Hyperparameter optimization is even more challenging in federated learning, where models are learned over a distributed network of heterogeneous devices; here, the need to keep data on device and perform local training makes it difficult to efficiently train and evaluate configurations. In this work, we investig&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2106.04502v2-abstract-full').style.display = 'inline'; document.getElementById('2106.04502v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2106.04502v2-abstract-full" style="display: none;"> Tuning hyperparameters is a crucial but arduous part of the machine learning pipeline. Hyperparameter optimization is even more challenging in federated learning, where models are learned over a distributed network of heterogeneous devices; here, the need to keep data on device and perform local training makes it difficult to efficiently train and evaluate configurations. In this work, we investigate the problem of federated hyperparameter tuning. We first identify key challenges and show how standard approaches may be adapted to form baselines for the federated setting. Then, by making a novel connection to the neural architecture search technique of weight-sharing, we introduce a new method, FedEx, to accelerate federated hyperparameter tuning that is applicable to widely-used federated optimization methods such as FedAvg and recent variants. Theoretically, we show that a FedEx variant correctly tunes the on-device learning rate in the setting of online convex optimization across devices. 
Empirically, we show that FedEx can outperform natural baselines for federated hyperparameter tuning by several percentage points on the Shakespeare, FEMNIST, and CIFAR-10 benchmarks, obtaining higher accuracy using the same training budget. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2106.04502v2-abstract-full').style.display = 'none'; document.getElementById('2106.04502v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 June, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2021</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2103.00697">arXiv:2103.00697</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2103.00697">pdf</a>, <a href="https://arxiv.org/format/2103.00697">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Heterogeneity for the Win: One-Shot Federated Clustering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Dennis%2C+D+K">Don Kurian Dennis</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+T">Tian Li</a>, <a href="/search/cs?searchtype=author&amp;query=Smith%2C+V">Virginia Smith</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" 
id="2103.00697v2-abstract-short" style="display: inline;"> In this work, we explore the unique challenges -- and opportunities -- of unsupervised federated learning (FL). We develop and analyze a one-shot federated clustering scheme, $k$-FED, based on the widely-used Lloyd&#39;s method for $k$-means clustering. In contrast to many supervised problems, we show that the issue of statistical heterogeneity in federated networks can in fact benefit our analysis. W&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2103.00697v2-abstract-full').style.display = 'inline'; document.getElementById('2103.00697v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2103.00697v2-abstract-full" style="display: none;"> In this work, we explore the unique challenges -- and opportunities -- of unsupervised federated learning (FL). We develop and analyze a one-shot federated clustering scheme, $k$-FED, based on the widely-used Lloyd&#39;s method for $k$-means clustering. In contrast to many supervised problems, we show that the issue of statistical heterogeneity in federated networks can in fact benefit our analysis. We analyse $k$-FED under a center separation assumption and compare it to the best known requirements of its centralized counterpart. Our analysis shows that in heterogeneous regimes where the number of clusters per device $(k&#39;)$ is smaller than the total number of clusters over the network $k$, $(k&#39;\le \sqrt{k})$, we can use heterogeneity to our advantage -- significantly weakening the cluster separation requirements for $k$-FED. From a practical viewpoint, $k$-FED also has many desirable properties: it requires only one round of communication, can run asynchronously, and can handle partial participation or node/network failures. 
We motivate our analysis with experiments on common FL benchmarks, and highlight the practical utility of one-shot clustering through use-cases in personalized FL and device sampling. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2103.00697v2-abstract-full').style.display = 'none'; document.getElementById('2103.00697v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 October, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 February, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2021. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2102.11503">arXiv:2102.11503</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2102.11503">pdf</a>, <a href="https://arxiv.org/format/2102.11503">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Two Sides of Meta-Learning Evaluation: In vs. 
Out of Distribution </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Setlur%2C+A">Amrith Setlur</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+O">Oscar Li</a>, <a href="/search/cs?searchtype=author&amp;query=Smith%2C+V">Virginia Smith</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2102.11503v3-abstract-short" style="display: inline;"> We categorize meta-learning evaluation into two settings: $\textit{in-distribution}$ [ID], in which the train and test tasks are sampled $\textit{iid}$ from the same underlying task distribution, and $\textit{out-of-distribution}$ [OOD], in which they are not. While most meta-learning theory and some FSL applications follow the ID setting, we identify that most existing few-shot classification ben&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2102.11503v3-abstract-full').style.display = 'inline'; document.getElementById('2102.11503v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2102.11503v3-abstract-full" style="display: none;"> We categorize meta-learning evaluation into two settings: $\textit{in-distribution}$ [ID], in which the train and test tasks are sampled $\textit{iid}$ from the same underlying task distribution, and $\textit{out-of-distribution}$ [OOD], in which they are not. While most meta-learning theory and some FSL applications follow the ID setting, we identify that most existing few-shot classification benchmarks instead reflect OOD evaluation, as they use disjoint sets of train (base) and test (novel) classes for task generation. 
This discrepancy is problematic because -- as we show on numerous benchmarks -- meta-learning methods that perform better on existing OOD datasets may perform significantly worse in the ID setting. In addition, in the OOD setting, even though current FSL benchmarks seem befitting, our study highlights concerns in 1) reliably performing model selection for a given meta-learning method, and 2) consistently comparing the performance of different methods. To address these concerns, we provide suggestions on how to construct FSL benchmarks to allow for ID evaluation as well as more reliable OOD evaluation. Our work aims to inform the meta-learning community about the importance and distinction of ID vs. OOD evaluation, as well as the subtleties of OOD evaluation with current benchmarks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2102.11503v3-abstract-full').style.display = 'none'; document.getElementById('2102.11503v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 October, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 February, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2021. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2102.08504">arXiv:2102.08504</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2102.08504">pdf</a>, <a href="https://arxiv.org/format/2102.08504">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Label Leakage and Protection in Two-party Split Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+O">Oscar Li</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+J">Jiankai Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+X">Xin Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+W">Weihao Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+H">Hongyi Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Xie%2C+J">Junyuan Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Smith%2C+V">Virginia Smith</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+C">Chong Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2102.08504v3-abstract-short" style="display: inline;"> Two-party split learning is a popular technique for learning a model across feature-partitioned data. In this work, we explore whether it is possible for one party to steal the private label information from the other party during split training, and whether there are methods that can protect against such attacks. 
Specifically, we first formulate a realistic threat model and propose a privacy loss&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2102.08504v3-abstract-full').style.display = 'inline'; document.getElementById('2102.08504v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2102.08504v3-abstract-full" style="display: none;"> Two-party split learning is a popular technique for learning a model across feature-partitioned data. In this work, we explore whether it is possible for one party to steal the private label information from the other party during split training, and whether there are methods that can protect against such attacks. Specifically, we first formulate a realistic threat model and propose a privacy loss metric to quantify label leakage in split learning. We then show that there exist two simple yet effective methods within the threat model that can allow one party to accurately recover private ground-truth labels owned by the other party. To combat these attacks, we propose several random perturbation techniques, including $\texttt{Marvell}$, an approach that strategically finds the structure of the noise perturbation by minimizing the amount of label leakage (measured through our quantification metric) of a worst-case adversary. We empirically demonstrate the effectiveness of our protection techniques against the identified attacks, and show that $\texttt{Marvell}$ in particular has improved privacy-utility tradeoffs relative to baseline approaches. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2102.08504v3-abstract-full').style.display = 'none'; document.getElementById('2102.08504v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 May, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 February, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to ICLR 2022 (https://openreview.net/forum?id=cOtBRgsf2fO)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2012.04221">arXiv:2012.04221</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2012.04221">pdf</a>, <a href="https://arxiv.org/format/2012.04221">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Ditto: Fair and Robust Federated Learning Through Personalization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+T">Tian Li</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+S">Shengyuan Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Beirami%2C+A">Ahmad Beirami</a>, <a href="/search/cs?searchtype=author&amp;query=Smith%2C+V">Virginia Smith</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" 
id="2012.04221v3-abstract-short" style="display: inline;"> Fairness and robustness are two important concerns for federated learning systems. In this work, we identify that robustness to data and model poisoning attacks and fairness, measured as the uniformity of performance across devices, are competing constraints in statistically heterogeneous networks. To address these constraints, we propose employing a simple, general framework for personalized fede&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2012.04221v3-abstract-full').style.display = 'inline'; document.getElementById('2012.04221v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2012.04221v3-abstract-full" style="display: none;"> Fairness and robustness are two important concerns for federated learning systems. In this work, we identify that robustness to data and model poisoning attacks and fairness, measured as the uniformity of performance across devices, are competing constraints in statistically heterogeneous networks. To address these constraints, we propose employing a simple, general framework for personalized federated learning, Ditto, that can inherently provide fairness and robustness benefits, and develop a scalable solver for it. Theoretically, we analyze the ability of Ditto to achieve fairness and robustness simultaneously on a class of linear problems. Empirically, across a suite of federated datasets, we show that Ditto not only achieves competitive performance relative to recent personalization methods, but also enables more accurate, robust, and fair models relative to state-of-the-art fair or robust baselines. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2012.04221v3-abstract-full').style.display = 'none'; document.getElementById('2012.04221v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 June, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 December, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICML 2021</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2011.14097">arXiv:2011.14097</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2011.14097">pdf</a>, <a href="https://arxiv.org/format/2011.14097">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3442381.3449903">10.1145/3442381.3449903 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Time Series Change Point Detection with Self-Supervised Contrastive Predictive Coding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a 
href="/search/cs?searchtype=author&amp;query=Deldari%2C+S">Shohreh Deldari</a>, <a href="/search/cs?searchtype=author&amp;query=Smith%2C+D+V">Daniel V. Smith</a>, <a href="/search/cs?searchtype=author&amp;query=Xue%2C+H">Hao Xue</a>, <a href="/search/cs?searchtype=author&amp;query=Salim%2C+F+D">Flora D. Salim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2011.14097v5-abstract-short" style="display: inline;"> Change Point Detection (CPD) methods identify the times associated with changes in the trends and properties of time series data in order to describe the underlying behaviour of the system. For instance, detecting the changes and anomalies associated with web service usage, application usage or human behaviour can provide valuable insights for downstream modelling tasks. We propose a novel approac&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2011.14097v5-abstract-full').style.display = 'inline'; document.getElementById('2011.14097v5-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2011.14097v5-abstract-full" style="display: none;"> Change Point Detection (CPD) methods identify the times associated with changes in the trends and properties of time series data in order to describe the underlying behaviour of the system. For instance, detecting the changes and anomalies associated with web service usage, application usage or human behaviour can provide valuable insights for downstream modelling tasks. We propose a novel approach for self-supervised Time Series Change Point detection method based on Contrastive Predictive coding (TS-CP^2). 
TS-CP^2 is the first approach to employ a contrastive learning strategy for CPD by learning an embedded representation that separates pairs of embeddings of time adjacent intervals from pairs of interval embeddings separated across time. Through extensive experiments on three diverse, widely used time series datasets, we demonstrate that our method outperforms five state-of-the-art CPD methods, which include unsupervised and semi-supervised approaches. TS-CP^2 is shown to improve the performance of methods that use either handcrafted statistical or temporal features by 79.4% and deep learning-based methods by 17.0% with respect to the F1-score averaged across the three datasets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2011.14097v5-abstract-full').style.display = 'none'; document.getElementById('2011.14097v5-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 March, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 November, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2020. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at The WEB Conference 2021 (WWW&#39;21)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2011.14048">arXiv:2011.14048</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2011.14048">pdf</a>, <a href="https://arxiv.org/format/2011.14048">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Is Support Set Diversity Necessary for Meta-Learning? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Setlur%2C+A">Amrith Setlur</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+O">Oscar Li</a>, <a href="/search/cs?searchtype=author&amp;query=Smith%2C+V">Virginia Smith</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2011.14048v2-abstract-short" style="display: inline;"> Meta-learning is a popular framework for learning with limited data in which an algorithm is produced by training over multiple few-shot learning tasks. For classification problems, these tasks are typically constructed by sampling a small number of support and query examples from a subset of the classes. 
While conventional wisdom is that task diversity should improve the performance of meta-learn&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2011.14048v2-abstract-full').style.display = 'inline'; document.getElementById('2011.14048v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2011.14048v2-abstract-full" style="display: none;"> Meta-learning is a popular framework for learning with limited data in which an algorithm is produced by training over multiple few-shot learning tasks. For classification problems, these tasks are typically constructed by sampling a small number of support and query examples from a subset of the classes. While conventional wisdom is that task diversity should improve the performance of meta-learning, in this work we find evidence to the contrary: we propose a modification to traditional meta-learning approaches in which we keep the support sets fixed across tasks, thus reducing task diversity. Surprisingly, we find that not only does this modification not result in adverse effects, it almost always improves the performance for a variety of datasets and meta-learning methods. We also provide several initial analyses to understand this phenomenon. Our work serves to: (i) more closely investigate the effect of support set construction for the problem of meta-learning, and (ii) suggest a simple, general, and competitive baseline for few-shot learning. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2011.14048v2-abstract-full').style.display = 'none'; document.getElementById('2011.14048v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 October, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 November, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> NeurIPS 2020 Workshop on Meta-learning </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2008.03230">arXiv:2008.03230</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2008.03230">pdf</a>, <a href="https://arxiv.org/format/2008.03230">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Databases">cs.DB</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3411832">10.1145/3411832 <i class="fa fa-external-link" 
aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> ESPRESSO: Entropy and ShaPe awaRe timE-Series SegmentatiOn for processing heterogeneous sensor data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Deldari%2C+S">Shohreh Deldari</a>, <a href="/search/cs?searchtype=author&amp;query=Smith%2C+D+V">Daniel V. Smith</a>, <a href="/search/cs?searchtype=author&amp;query=Sadri%2C+A">Amin Sadri</a>, <a href="/search/cs?searchtype=author&amp;query=Salim%2C+F+D">Flora D. Salim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2008.03230v1-abstract-short" style="display: inline;"> Extracting informative and meaningful temporal segments from high-dimensional wearable sensor data, smart devices, or IoT data is a vital preprocessing step in applications such as Human Activity Recognition (HAR), trajectory prediction, gesture recognition, and lifelogging. In this paper, we propose ESPRESSO (Entropy and ShaPe awaRe timE-Series SegmentatiOn), a hybrid segmentation model for multi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2008.03230v1-abstract-full').style.display = 'inline'; document.getElementById('2008.03230v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2008.03230v1-abstract-full" style="display: none;"> Extracting informative and meaningful temporal segments from high-dimensional wearable sensor data, smart devices, or IoT data is a vital preprocessing step in applications such as Human Activity Recognition (HAR), trajectory prediction, gesture recognition, and lifelogging. 
In this paper, we propose ESPRESSO (Entropy and ShaPe awaRe timE-Series SegmentatiOn), a hybrid segmentation model for multi-dimensional time-series that is formulated to exploit the entropy and temporal shape properties of time-series. ESPRESSO differs from existing methods that focus upon particular statistical or temporal properties of time-series exclusively. As part of model development, a novel temporal representation of time-series $WCAC$ was introduced along with a greedy search approach that estimates segments based upon the entropy metric. ESPRESSO was shown to offer superior performance to four state-of-the-art methods across seven public datasets of wearable and wear-free sensing. In addition, we undertake a deeper investigation of these datasets to understand how ESPRESSO and its constituent methods perform with respect to different dataset characteristics. Finally, we provide two interesting case-studies to show how applying ESPRESSO can assist in inferring daily activity routines and the emotional state of humans. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2008.03230v1-abstract-full').style.display = 'none'; document.getElementById('2008.03230v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 July, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2020. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">23 pages, 11 figures, accepted at IMWUT Volume(4) issue(3)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2007.01162">arXiv:2007.01162</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2007.01162">pdf</a>, <a href="https://arxiv.org/format/2007.01162">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Tilted Empirical Risk Minimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+T">Tian Li</a>, <a href="/search/cs?searchtype=author&amp;query=Beirami%2C+A">Ahmad Beirami</a>, <a href="/search/cs?searchtype=author&amp;query=Sanjabi%2C+M">Maziar Sanjabi</a>, <a href="/search/cs?searchtype=author&amp;query=Smith%2C+V">Virginia Smith</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2007.01162v2-abstract-short" style="display: inline;"> Empirical risk minimization (ERM) is typically designed to perform well on the average loss, which can result in estimators that are sensitive to outliers, generalize poorly, or treat subgroups unfairly. While many methods aim to address these problems individually, in this work, we explore them through a unified framework -- tilted empirical risk minimization (TERM). 
In particular, we show that i&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2007.01162v2-abstract-full').style.display = 'inline'; document.getElementById('2007.01162v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2007.01162v2-abstract-full" style="display: none;"> Empirical risk minimization (ERM) is typically designed to perform well on the average loss, which can result in estimators that are sensitive to outliers, generalize poorly, or treat subgroups unfairly. While many methods aim to address these problems individually, in this work, we explore them through a unified framework -- tilted empirical risk minimization (TERM). In particular, we show that it is possible to flexibly tune the impact of individual losses through a straightforward extension to ERM using a hyperparameter called the tilt. We provide several interpretations of the resulting framework: We show that TERM can increase or decrease the influence of outliers, respectively, to enable fairness or robustness; has variance-reduction properties that can benefit generalization; and can be viewed as a smooth approximation to a superquantile method. We develop batch and stochastic first-order optimization methods for solving TERM, and show that the problem can be efficiently solved relative to common alternatives. Finally, we demonstrate that TERM can be used for a multitude of applications, such as enforcing fairness between subgroups, mitigating the effect of outliers, and handling class imbalance. TERM is not only competitive with existing solutions tailored to these individual problems, but can also enable entirely new applications, such as simultaneously addressing outliers and promoting fairness. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2007.01162v2-abstract-full').style.display = 'none'; document.getElementById('2007.01162v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 March, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 July, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICLR 2021</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2001.01920">arXiv:2001.01920</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2001.01920">pdf</a>, <a href="https://arxiv.org/format/2001.01920">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> FedDANE: A Federated Newton-Type Method </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+T">Tian Li</a>, <a href="/search/cs?searchtype=author&amp;query=Sahu%2C+A+K">Anit Kumar Sahu</a>, <a href="/search/cs?searchtype=author&amp;query=Zaheer%2C+M">Manzil Zaheer</a>, <a href="/search/cs?searchtype=author&amp;query=Sanjabi%2C+M">Maziar Sanjabi</a>, <a href="/search/cs?searchtype=author&amp;query=Talwalkar%2C+A">Ameet Talwalkar</a>, <a href="/search/cs?searchtype=author&amp;query=Smith%2C+V">Virginia Smith</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis 
has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2001.01920v1-abstract-short" style="display: inline;"> Federated learning aims to jointly learn statistical models over massively distributed remote devices. In this work, we propose FedDANE, an optimization method that we adapt from DANE, a method for classical distributed optimization, to handle the practical constraints of federated learning. We provide convergence guarantees for this method when learning over both convex and non-convex functions.&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2001.01920v1-abstract-full').style.display = 'inline'; document.getElementById('2001.01920v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2001.01920v1-abstract-full" style="display: none;"> Federated learning aims to jointly learn statistical models over massively distributed remote devices. In this work, we propose FedDANE, an optimization method that we adapt from DANE, a method for classical distributed optimization, to handle the practical constraints of federated learning. We provide convergence guarantees for this method when learning over both convex and non-convex functions. Despite encouraging theoretical results, we find that the method has underwhelming performance empirically. In particular, through empirical simulations on both synthetic and real-world datasets, FedDANE consistently underperforms baselines of FedAvg and FedProx in realistic federated settings. We identify low device participation and statistical device heterogeneity as two underlying causes of this underwhelming performance, and conclude by suggesting several directions of future work. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2001.01920v1-abstract-full').style.display = 'none'; document.getElementById('2001.01920v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 January, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Asilomar Conference on Signals, Systems, and Computers 2019</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1911.01812">arXiv:1911.01812</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1911.01812">pdf</a>, <a href="https://arxiv.org/format/1911.01812">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Enhancing the Privacy of Federated Learning with Sketching </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Zaoxing Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+T">Tian Li</a>, <a href="/search/cs?searchtype=author&amp;query=Smith%2C+V">Virginia Smith</a>, <a href="/search/cs?searchtype=author&amp;query=Sekar%2C+V">Vyas Sekar</a> </p> <p class="abstract mathjax"> <span 
class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1911.01812v1-abstract-short" style="display: inline;"> In response to growing concerns about user privacy, federated learning has emerged as a promising tool to train statistical models over networks of devices while keeping data localized. Federated learning methods run training tasks directly on user devices and do not share the raw user data with third parties. However, current methods still share model updates, which may contain private informatio&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1911.01812v1-abstract-full').style.display = 'inline'; document.getElementById('1911.01812v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1911.01812v1-abstract-full" style="display: none;"> In response to growing concerns about user privacy, federated learning has emerged as a promising tool to train statistical models over networks of devices while keeping data localized. Federated learning methods run training tasks directly on user devices and do not share the raw user data with third parties. However, current methods still share model updates, which may contain private information (e.g., one&#39;s weight and height), during the training process. Existing efforts that aim to improve the privacy of federated learning make compromises in one or more of the following key areas: performance (particularly communication cost), accuracy, or privacy. To better optimize these trade-offs, we propose that \textit{sketching algorithms} have a unique advantage in that they can provide both privacy and performance benefits while maintaining accuracy. We evaluate the feasibility of sketching-based federated learning with a prototype on three representative learning models. 
Our initial findings show that it is possible to provide strong privacy guarantees for federated learning without sacrificing performance or accuracy. Our work highlights that there exists a fundamental connection between privacy and communication in distributed settings, and suggests important open problems surrounding the theoretical understanding, methodology, and system design of practical, private federated learning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1911.01812v1-abstract-full').style.display = 'none'; document.getElementById('1911.01812v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 November, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2019. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1911.00972">arXiv:1911.00972</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1911.00972">pdf</a>, <a href="https://arxiv.org/format/1911.00972">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Privacy for Free: Communication-Efficient Learning with Differential Privacy Using Sketches </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+T">Tian Li</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Zaoxing Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Sekar%2C+V">Vyas Sekar</a>, <a 
href="/search/cs?searchtype=author&amp;query=Smith%2C+V">Virginia Smith</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1911.00972v2-abstract-short" style="display: inline;"> Communication and privacy are two critical concerns in distributed learning. Many existing works treat these concerns separately. In this work, we argue that a natural connection exists between methods for communication reduction and privacy preservation in the context of distributed machine learning. In particular, we prove that Count Sketch, a simple method for data stream summarization, has inh&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1911.00972v2-abstract-full').style.display = 'inline'; document.getElementById('1911.00972v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1911.00972v2-abstract-full" style="display: none;"> Communication and privacy are two critical concerns in distributed learning. Many existing works treat these concerns separately. In this work, we argue that a natural connection exists between methods for communication reduction and privacy preservation in the context of distributed machine learning. In particular, we prove that Count Sketch, a simple method for data stream summarization, has inherent differential privacy properties. Using these derived privacy guarantees, we propose a novel sketch-based framework (DiffSketch) for distributed learning, where we compress the transmitted messages via sketches to simultaneously achieve communication efficiency and provable privacy benefits. Our evaluation demonstrates that DiffSketch can provide strong differential privacy guarantees (e.g., $\varepsilon$= 1) and reduce communication by 20-50x with only marginal decreases in accuracy. 
Compared to baselines that treat privacy and communication separately, DiffSketch improves absolute test accuracy by 5%-50% while offering the same privacy guarantees and communication compression. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1911.00972v2-abstract-full').style.display = 'none'; document.getElementById('1911.00972v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 December, 2019; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 November, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2019. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1911.00472">arXiv:1911.00472</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1911.00472">pdf</a>, <a href="https://arxiv.org/format/1911.00472">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Progressive Compressed Records: Taking a Byte out of Deep Learning Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Kuchnik%2C+M">Michael Kuchnik</a>, <a href="/search/cs?searchtype=author&amp;query=Amvrosiadis%2C+G">George Amvrosiadis</a>, <a href="/search/cs?searchtype=author&amp;query=Smith%2C+V">Virginia Smith</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1911.00472v4-abstract-short" style="display: inline;"> Deep learning accelerators 
efficiently train over vast and growing amounts of data, placing a newfound burden on commodity networks and storage devices. A common approach to conserve bandwidth involves resizing or compressing data prior to training. We introduce Progressive Compressed Records (PCRs), a data format that uses compression to reduce the overhead of fetching and transporting data, effe&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1911.00472v4-abstract-full').style.display = 'inline'; document.getElementById('1911.00472v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1911.00472v4-abstract-full" style="display: none;"> Deep learning accelerators efficiently train over vast and growing amounts of data, placing a newfound burden on commodity networks and storage devices. A common approach to conserve bandwidth involves resizing or compressing data prior to training. We introduce Progressive Compressed Records (PCRs), a data format that uses compression to reduce the overhead of fetching and transporting data, effectively reducing the training time required to achieve a target accuracy. PCRs deviate from previous storage formats by combining progressive compression with an efficient storage layout to view a single dataset at multiple fidelities---all without adding to the total dataset size. We implement PCRs and evaluate them on a range of datasets, training tasks, and hardware architectures. Our work shows that: (i) the amount of compression a dataset can tolerate exceeds 50% of the original encoding for many DL training tasks; (ii) it is possible to automatically and efficiently select appropriate compression levels for a given task; and (iii) PCRs enable tasks to readily access compressed data at runtime---utilizing as little as half the training bandwidth and thus potentially doubling training speed. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1911.00472v4-abstract-full').style.display = 'none'; document.getElementById('1911.00472v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 August, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 November, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2019. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1908.07873">arXiv:1908.07873</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1908.07873">pdf</a>, <a href="https://arxiv.org/format/1908.07873">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/MSP.2020.2975749">10.1109/MSP.2020.2975749 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Federated Learning: Challenges, Methods, and Future Directions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+T">Tian Li</a>, <a href="/search/cs?searchtype=author&amp;query=Sahu%2C+A+K">Anit Kumar Sahu</a>, <a href="/search/cs?searchtype=author&amp;query=Talwalkar%2C+A">Ameet Talwalkar</a>, <a 
href="/search/cs?searchtype=author&amp;query=Smith%2C+V">Virginia Smith</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1908.07873v1-abstract-short" style="display: inline;"> Federated learning involves training statistical models over remote devices or siloed data centers, such as mobile phones or hospitals, while keeping data localized. Training in heterogeneous and potentially massive networks introduces novel challenges that require a fundamental departure from standard approaches for large-scale machine learning, distributed optimization, and privacy-preserving da&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1908.07873v1-abstract-full').style.display = 'inline'; document.getElementById('1908.07873v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1908.07873v1-abstract-full" style="display: none;"> Federated learning involves training statistical models over remote devices or siloed data centers, such as mobile phones or hospitals, while keeping data localized. Training in heterogeneous and potentially massive networks introduces novel challenges that require a fundamental departure from standard approaches for large-scale machine learning, distributed optimization, and privacy-preserving data analysis. In this article, we discuss the unique characteristics and challenges of federated learning, provide a broad overview of current approaches, and outline several directions of future work that are relevant to a wide range of research communities. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1908.07873v1-abstract-full').style.display = 'none'; document.getElementById('1908.07873v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 August, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2019. </p> </li> </ol>
<!-- Pagination: the link for the page being viewed carries both the is-current class and aria-current="page" and is labelled "Page N"; links to other pages are labelled "Goto page N". -->
<nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination">
  <a href="" class="pagination-previous is-invisible">Previous </a>
  <a href="/search/?searchtype=author&amp;query=Smith%2C+V&amp;start=50" class="pagination-next" >Next </a>
  <ul class="pagination-list">
    <li> <a href="/search/?searchtype=author&amp;query=Smith%2C+V&amp;start=0" class="pagination-link is-current" aria-label="Page 1" aria-current="page">1 </a> </li>
    <li> <a href="/search/?searchtype=author&amp;query=Smith%2C+V&amp;start=50" class="pagination-link " aria-label="Goto page 2">2 </a> </li>
  </ul>
</nav>
<div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5
0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li>
<li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li>
</ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li>
<!-- The status and notification links below open in a new tab; rel="noopener" keeps the opened page from getting a window.opener reference back to this page. -->
<li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank" rel="noopener">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank" rel="noopener"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank" rel="noopener"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li>
</ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script
src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

Pages: 1 2 3 4 5 6 7 8 9 10