Search | arXiv e-print repository

<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 88 results for author: <span class="mathjax">Steeg, G V</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Steeg%2C+G+V">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Steeg, G V"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Steeg%2C+G+V&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Steeg, G V"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Steeg%2C+G+V&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Steeg%2C+G+V&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Steeg%2C+G+V&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.15435">arXiv:2501.15435</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.15435">pdf</a>, <a href="https://arxiv.org/format/2501.15435">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Making Sense Of Distributed Representations With Activation Spectroscopy </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Reing%2C+K">Kyle Reing</a>, <a href="/search/cs?searchtype=author&amp;query=Steeg%2C+G+V">Greg Ver Steeg</a>, <a href="/search/cs?searchtype=author&amp;query=Galstyan%2C+A">Aram Galstyan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.15435v1-abstract-short" style="display: inline;"> In the study of neural network interpretability, there is growing evidence to suggest that relevant features are encoded across many neurons in a distributed fashion. Making sense of these distributed representations without knowledge of the network&#39;s encoding strategy is a combinatorial task that is not guaranteed to be tractable. This work explores one feasible path to both detecting and tracing&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.15435v1-abstract-full').style.display = 'inline'; document.getElementById('2501.15435v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.15435v1-abstract-full" style="display: none;"> In the study of neural network interpretability, there is growing evidence to suggest that relevant features are encoded across many neurons in a distributed fashion. 
Making sense of these distributed representations without knowledge of the network&#39;s encoding strategy is a combinatorial task that is not guaranteed to be tractable. This work explores one feasible path to both detecting and tracing the joint influence of neurons in a distributed representation. We term this approach Activation Spectroscopy (ActSpec), owing to its analysis of the pseudo-Boolean Fourier spectrum defined over the activation patterns of a network layer. The sub-network defined between a given layer and an output logit is cast as a special class of pseudo-Boolean function. The contributions of each subset of neurons in the specified layer can be quantified through the function&#39;s Fourier coefficients. We propose a combinatorial optimization procedure to search for Fourier coefficients that are simultaneously high-valued, and non-redundant. This procedure can be viewed as an extension of the Goldreich-Levin algorithm which incorporates additional problem-specific constraints. The resulting coefficients specify a collection of subsets, which are used to test the degree to which a representation is distributed. We verify our approach in a number of synthetic settings and compare against existing interpretability benchmarks. We conclude with a number of experimental evaluations on an MNIST classifier, and a transformer-based network for sentiment analysis. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.15435v1-abstract-full').style.display = 'none'; document.getElementById('2501.15435v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.05855">arXiv:2411.05855</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.05855">pdf</a>, <a href="https://arxiv.org/format/2411.05855">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> </div> </div> <p class="title is-5 mathjax"> Learning Morphisms with Gauss-Newton Approximation for Growing Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lawton%2C+N">Neal Lawton</a>, <a href="/search/cs?searchtype=author&amp;query=Galstyan%2C+A">Aram Galstyan</a>, <a href="/search/cs?searchtype=author&amp;query=Steeg%2C+G+V">Greg Ver Steeg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.05855v1-abstract-short" style="display: inline;"> A popular method for Neural Architecture Search (NAS) is based on growing networks via small local changes to the network&#39;s architecture called network morphisms. These methods start with a small seed network and progressively grow the network by adding new neurons in an automated way. 
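The abstract above reduces the layer-to-logit map to a pseudo-Boolean function and reads joint neuron influence off its Fourier coefficients. As a hedged illustration of that quantity only (not the ActSpec search or its Goldreich-Levin-style constraints), here is a minimal Monte-Carlo estimate of pseudo-Boolean Fourier coefficients for a toy "logit" over binary activation patterns; the function f and the brute-force subset enumeration are stand-ins.

```python
# Illustrative only: Monte-Carlo estimate of pseudo-Boolean Fourier coefficients
#   f_hat(S) = E_x[ f(x) * prod_{i in S} (-1)^{x_i} ],  x uniform in {0,1}^n.
# `f` is a stand-in for the layer-to-logit map; the ActSpec search for high-valued,
# non-redundant coefficients (the constrained Goldreich-Levin part) is not shown.
import itertools
import numpy as np

def fourier_coefficient(f, subset, n, num_samples=50_000, seed=0):
    rng = np.random.default_rng(seed)
    x = rng.integers(0, 2, size=(num_samples, n))   # random binary activation patterns
    chi = np.prod(1 - 2 * x[:, subset], axis=1)     # parity character chi_S(x)
    return float(np.mean(f(x) * chi))

# Toy "logit": depends on neurons 0 and 2 jointly (XOR), plus a weak effect of neuron 1.
f = lambda x: np.logical_xor(x[:, 0], x[:, 2]).astype(float) + 0.1 * x[:, 1]

subsets = [S for k in (1, 2) for S in itertools.combinations(range(4), k)]
coeffs = {S: fourier_coefficient(f, list(S), n=4) for S in subsets}
print(sorted(coeffs.items(), key=lambda kv: -abs(kv[1]))[:3])  # the pair (0, 2) dominates
```

The dominant coefficient on the pair (0, 2) is the kind of signal the abstract describes: influence that no single neuron carries on its own.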
arXiv:2411.05855 [pdf, other]  cs.LG (Machine Learning); cs.CV (Computer Vision and Pattern Recognition); cs.NE (Neural and Evolutionary Computing)
Learning Morphisms with Gauss-Newton Approximation for Growing Networks
Authors: Neal Lawton, Aram Galstyan, Greg Ver Steeg
Abstract: A popular method for Neural Architecture Search (NAS) is based on growing networks via small local changes to the network's architecture called network morphisms. These methods start with a small seed network and progressively grow the network by adding new neurons in an automated way. However, it remains a challenge to efficiently determine which parts of the network are best to grow. Here we propose a NAS method for growing a network by using a Gauss-Newton approximation of the loss function to efficiently learn and evaluate candidate network morphisms. We compare our method with state-of-the-art NAS methods for CIFAR-10 and CIFAR-100 classification tasks, and conclude our method learns similar quality or better architectures at a smaller computational cost.
Submitted 6 November, 2024; originally announced November 2024.
Comments: 12 pages, 4 figures
MSC Class: 68T07; ACM Class: I.2.8
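The key computational idea in the abstract above is scoring candidate morphisms with a Gauss-Newton approximation of the loss. A minimal sketch of such a quadratic model for a squared-error loss follows; the residual vector r0 and the Jacobian J with respect to the would-be new parameters are hypothetical inputs, and this is not claimed to be the paper's procedure.

```python
# Illustrative only: score a candidate growth site with a Gauss-Newton quadratic
# model of a squared-error loss L(delta) = 0.5 * ||r0 + J @ delta||^2, where delta
# are the would-be new (morphism) parameters, r0 the current residuals, and J the
# Jacobian of the residuals w.r.t. delta.
import numpy as np

def gauss_newton_score(r0, J, damping=1e-3):
    """Return (predicted loss decrease, approximately optimal delta)."""
    g = J.T @ r0                                    # gradient w.r.t. the new parameters
    H = J.T @ J + damping * np.eye(J.shape[1])      # Gauss-Newton Hessian approximation
    delta = -np.linalg.solve(H, g)
    predicted_decrease = -(g @ delta + 0.5 * delta @ H @ delta)   # = 0.5 * g^T H^{-1} g
    return predicted_decrease, delta

rng = np.random.default_rng(0)
r0, J = rng.normal(size=50), rng.normal(size=(50, 3))
score, delta = gauss_newton_score(r0, J)
print(f"predicted loss decrease for this candidate morphism: {score:.3f}")
```

Under this sketch, the candidate with the largest predicted decrease is the one worth growing, which is the cheap "learn and evaluate candidate network morphisms" step the abstract alludes to.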
arXiv:2410.21553 [pdf, other]  cs.LG (Machine Learning)
Exploring the Design Space of Diffusion Bridge Models via Stochasticity Control
Authors: Shaorong Zhang, Yuanbin Cheng, Xianghao Kong, Greg Ver Steeg
Abstract: Diffusion bridge models effectively facilitate image-to-image (I2I) translation by connecting two distributions. However, existing methods overlook the impact of noise in the sampling SDEs, the transition kernel, and the base distribution on sampling efficiency, image quality, and diversity. To address this gap, we propose the Stochasticity-controlled Diffusion Bridge (SDB), a novel theoretical framework that extends the design space of diffusion bridges and provides strategies to mitigate singularities during both training and sampling. By controlling stochasticity in the sampling SDEs, our sampler achieves speeds up to 5 times faster than the baseline, while also producing lower FID scores. After training, SDB sets new benchmarks in image quality and sampling efficiency by managing stochasticity within the transition kernel. Furthermore, introducing stochasticity into the base distribution significantly improves image diversity, as quantified by a newly introduced metric.
Submitted 28 October, 2024; originally announced October 2024.
Comments: 23 pages, 9 figures
arXiv:2410.14713 [pdf, other]  cs.LG (Machine Learning); cs.CL (Computation and Language)
QuAILoRA: Quantization-Aware Initialization for LoRA
Authors: Neal Lawton, Aishwarya Padmakumar, Judith Gaspers, Jack FitzGerald, Anoop Kumar, Greg Ver Steeg, Aram Galstyan
Abstract: QLoRA reduces the memory cost of fine-tuning a large language model (LLM) with LoRA by quantizing the base LLM. However, quantization introduces quantization errors that negatively impact model performance after fine-tuning. In this paper we introduce QuAILoRA, a quantization-aware initialization for LoRA that mitigates this negative impact by decreasing quantization errors at initialization. Our method spends a small amount of computational overhead to compute this quantization-aware initialization, without increasing the memory cost of fine-tuning. We evaluate our method on several causal language modeling and downstream evaluation tasks using several different model sizes and families. We observe that almost all LLMs fine-tuned with QuAILoRA achieve better validation perplexity. When evaluated on downstream tasks, we find that QuAILoRA yields improvements proportional to the negative effect of quantization error. On average, applying QuAILoRA to 4-bit QLoRA models yields 75% of the validation perplexity decrease and 86% of the downstream task accuracy increase obtained by doubling the quantization precision to 8-bit, without increasing GPU memory utilization during fine-tuning.
Submitted 9 October, 2024; originally announced October 2024.
Comments: 12 pages, 7 figures. Submitted to the 4th NeurIPS Workshop on Efficient Natural Language and Speech Processing (ENLSP-IV)
MSC Class: 68T50
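The abstract above describes shrinking quantization error at initialization. One common way to realize a quantization-aware LoRA initialization, sketched below, is to pick the low-rank factors so that the quantized weight plus the LoRA update approximates the original full-precision weight, for example via a truncated SVD of the quantization residual; this conveys the idea but is not claimed to be QuAILoRA's exact construction.

```python
# Illustrative only: one way to build a quantization-aware LoRA initialization.
# Pick A, B so that Q + B @ A approximates the full-precision weight W, i.e. the
# LoRA factors absorb the leading part of the quantization residual (truncated SVD).
# Not claimed to be QuAILoRA's exact method; the quantizer here is a crude stand-in.
import numpy as np

def fake_quantize(W, n_bits=4):
    scale = np.abs(W).max() / (2 ** (n_bits - 1) - 1)
    return np.round(W / scale) * scale

def quant_aware_lora_init(W, rank=8, n_bits=4):
    Q = fake_quantize(W, n_bits)                        # frozen, quantized base weight
    U, S, Vt = np.linalg.svd(W - Q, full_matrices=False)
    B = U[:, :rank] * S[:rank]                          # (out, r)
    A = Vt[:rank]                                       # (r, in)
    return Q, A, B

rng = np.random.default_rng(0)
W = rng.normal(size=(256, 128))
Q, A, B = quant_aware_lora_init(W)
print(np.linalg.norm(W - Q), ">", np.linalg.norm(W - (Q + B @ A)))  # init error shrinks
```

Compared with the usual B = 0 initialization, the model starts closer to the full-precision weights while the memory cost of the frozen quantized base is unchanged.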
arXiv:2407.08946 [pdf, other]  cs.LG (Machine Learning)
Your Diffusion Model is Secretly a Noise Classifier and Benefits from Contrastive Training
Authors: Yunshu Wu, Yingtao Luo, Xianghao Kong, Evangelos E. Papalexakis, Greg Ver Steeg
Abstract: Diffusion models learn to denoise data, and the trained denoiser is then used to generate new samples from the data distribution. In this paper, we revisit the diffusion sampling process and identify a fundamental cause of sample quality degradation: the denoiser is poorly estimated in regions that are far Outside Of the training Distribution (OOD), and the sampling process inevitably evaluates in these OOD regions. This can become problematic for all sampling methods, especially when we move to parallel sampling, which requires us to initialize and update the entire sample trajectory of dynamics in parallel, leading to many OOD evaluations. To address this problem, we introduce a new self-supervised training objective that differentiates the levels of noise added to a sample, leading to improved OOD denoising performance. The approach is based on our observation that diffusion models implicitly define a log-likelihood ratio that distinguishes distributions with different amounts of noise, and this expression depends on denoiser performance outside the standard training distribution. We show by diverse experiments that the proposed contrastive diffusion training is effective for both sequential and parallel settings, and it improves the performance and speed of parallel samplers significantly.
Submitted 1 November, 2024; v1 submitted 11 July, 2024; originally announced July 2024.
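The training signal described in the abstract above is telling apart the levels of noise added to a sample. The toy objective below only illustrates that noise-level-discrimination idea with a small classifier on synthetic data; the paper's contrastive diffusion training instead works through the log-likelihood ratios implied by the denoiser itself.

```python
# Illustrative only: a toy self-supervised objective that classifies which of K
# noise levels was added to a sample. It conveys the "differentiate the noise
# levels" idea from the abstract, not the paper's contrastive diffusion objective.
import torch
import torch.nn as nn

K = 8                                    # number of discrete noise levels
sigmas = torch.linspace(0.1, 2.0, K)     # noise scales
classifier = nn.Sequential(nn.Linear(32, 64), nn.ReLU(), nn.Linear(64, K))
opt = torch.optim.Adam(classifier.parameters(), lr=1e-3)

for step in range(200):
    x = torch.randn(128, 32)                         # stand-in for clean data
    level = torch.randint(0, K, (128,))              # which noise level was applied
    x_noisy = x + sigmas[level, None] * torch.randn_like(x)
    loss = nn.functional.cross_entropy(classifier(x_noisy), level)
    opt.zero_grad(); loss.backward(); opt.step()
```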
arXiv:2402.15833 [pdf, other]  cs.CL (Computation and Language); cs.LG (Machine Learning)
Prompt Perturbation Consistency Learning for Robust Language Models
Authors: Yao Qiang, Subhrangshu Nandi, Ninareh Mehrabi, Greg Ver Steeg, Anoop Kumar, Anna Rumshisky, Aram Galstyan
Abstract: Large language models (LLMs) have demonstrated impressive performance on a number of natural language processing tasks, such as question answering and text summarization. However, their performance on sequence labeling tasks such as intent classification and slot filling (IC-SF), which is a central component in personal assistant systems, lags significantly behind discriminative models. Furthermore, there is a lack of substantive research on the robustness of LLMs to various perturbations in the input prompts. The contributions of this paper are three-fold. First, we show that fine-tuning sufficiently large LLMs can produce IC-SF performance comparable to discriminative models. Next, we systematically analyze the performance deterioration of those fine-tuned models due to three distinct yet relevant types of input perturbations: oronyms, synonyms, and paraphrasing. Finally, we propose an efficient mitigation approach, Prompt Perturbation Consistency Learning (PPCL), which works by regularizing the divergence between losses from clean and perturbed samples. Our experiments demonstrate that PPCL can recover on average 59% and 69% of the performance drop for IC and SF tasks, respectively. Furthermore, PPCL beats the data augmentation approach while using ten times fewer augmented data samples.
Submitted 24 February, 2024; originally announced February 2024.
arXiv:2402.08919 [pdf, other]  cs.CV (Computer Vision and Pattern Recognition); cs.LG (Machine Learning)
Interpretable Measures of Conceptual Similarity by Complexity-Constrained Descriptive Auto-Encoding
Authors: Alessandro Achille, Greg Ver Steeg, Tian Yu Liu, Matthew Trager, Carson Klingenberg, Stefano Soatto
Abstract: Quantifying the degree of similarity between images is a key copyright issue for image-based machine learning. In legal doctrine, however, determining the degree of similarity between works requires subjective analysis, and fact-finders (judges and juries) can demonstrate considerable variability in these subjective judgement calls. Images that are structurally similar can be deemed dissimilar, whereas images of completely different scenes can be deemed similar enough to support a claim of copying. We seek to define and compute a notion of "conceptual similarity" among images that captures high-level relations even among images that do not share repeated elements or visually similar components. The idea is to use a base multi-modal model to generate "explanations" (captions) of visual data at increasing levels of complexity. Then, similarity can be measured by the length of the caption needed to discriminate between the two images: two highly dissimilar images can be discriminated early in their description, whereas conceptually similar ones will need more detail to be distinguished. We operationalize this definition and show that it correlates with subjective (averaged human evaluation) assessment, and beats existing baselines on both image-to-image and text-to-text similarity benchmarks. Beyond just providing a number, our method also offers interpretability by pointing to the specific level of granularity of the description where the source data are differentiated.
Submitted 13 February, 2024; originally announced February 2024.
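The abstract above defines similarity by how much descriptive detail is needed before two items can be told apart. The toy scorer below operationalizes only that counting rule over hand-written caption ladders; the actual method generates the captions with a complexity-constrained multimodal model and uses a proper discriminability test rather than string equality.

```python
# Illustrative only: score "conceptual similarity" as the first level of caption
# detail at which two items can be told apart. The caption ladders are hypothetical
# inputs standing in for the model-generated, complexity-constrained descriptions.
def conceptual_similarity(captions_a, captions_b):
    """captions_*: captions of the same item at increasing levels of detail."""
    levels = min(len(captions_a), len(captions_b))
    for level in range(levels):
        if captions_a[level] != captions_b[level]:
            return level / levels        # discriminated early -> low similarity
    return 1.0                           # never discriminated -> maximally similar

a = ["an animal", "a dog outdoors", "a brown dog catching a frisbee in a park"]
b = ["an animal", "a dog outdoors", "a white dog sleeping on a porch"]
c = ["a vehicle", "a red car on a road", "a red sports car at night"]
print(conceptual_similarity(a, b))   # 0.67: only the most detailed level differs
print(conceptual_similarity(a, c))   # 0.0: discriminated at the coarsest level
```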
arXiv:2312.14440 [pdf, other]  cs.LG (Machine Learning); cs.CR (Cryptography and Security)
Asymmetric Bias in Text-to-Image Generation with Adversarial Attacks
Authors: Haz Sameen Shahgir, Xianghao Kong, Greg Ver Steeg, Yue Dong
Abstract: The widespread use of Text-to-Image (T2I) models in content generation requires careful examination of their safety, including their robustness to adversarial attacks. Despite extensive research on adversarial attacks, the reasons for their effectiveness remain underexplored. This paper presents an empirical study on adversarial attacks against T2I models, focusing on analyzing factors associated with attack success rates (ASR). We introduce a new attack objective, entity swapping using adversarial suffixes, and two gradient-based attack algorithms. Human and automatic evaluations reveal the asymmetric nature of ASRs on entity swap: for example, it is easier to replace "human" with "robot" in the prompt "a human dancing in the rain." with an adversarial suffix, but the reverse replacement is significantly harder. We further propose probing metrics to establish indicative signals from the model's beliefs to the adversarial ASR. We identify conditions that result in a success probability of 60% for adversarial attacks and others where this likelihood drops below 5%.
Submitted 17 July, 2024; v1 submitted 22 December, 2023; originally announced December 2023.
Comments: camera-ready version
arXiv:2310.07972 [pdf, other]  cs.LG (Machine Learning); cs.AI (Artificial Intelligence); cs.IT (Information Theory)
Interpretable Diffusion via Information Decomposition
Authors: Xianghao Kong, Ollie Liu, Han Li, Dani Yogatama, Greg Ver Steeg
Abstract: Denoising diffusion models enable conditional generation and density modeling of complex relationships like images and text. However, the nature of the learned relationships is opaque, making it difficult to understand precisely what relationships between words and parts of an image are captured, or to predict the effect of an intervention. We illuminate the fine-grained relationships learned by diffusion models by noticing a precise relationship between diffusion and information decomposition. Exact expressions for mutual information and conditional mutual information can be written in terms of the denoising model. Furthermore, pointwise estimates can easily be computed as well, allowing us to ask questions about the relationships between specific images and captions. Decomposing information even further to understand which variables in a high-dimensional space carry information is a long-standing problem. For diffusion models, we show that a natural non-negative decomposition of mutual information emerges, allowing us to quantify informative relationships between words and pixels in an image. We exploit these new relations to measure the compositional understanding of diffusion models, to do unsupervised localization of objects in images, and to measure effects when selectively editing images through prompt interventions.
Submitted 18 May, 2024; v1 submitted 11 October, 2023; originally announced October 2023.
Comments: 32 pages, 18 figures
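The claim above that mutual information has an exact expression in terms of the denoiser rests on an I-MMSE-style identity. A generic statement of that identity is sketched below in my own notation (not copied from the paper): conditioning the denoiser on the caption y lowers the reconstruction error, and integrating that gap over noise levels recovers I(x; y).

```latex
% Sketch of the classical I-MMSE-style relation behind the abstract, in generic
% notation: x_gamma = sqrt(gamma) x + eps with eps ~ N(0, I). The gap between the
% unconditional and caption-conditional denoising errors, integrated over SNR,
% equals the mutual information between image x and caption y.
\[
  I(x; y) \;=\; \tfrac{1}{2} \int_0^\infty
    \Big( \mathrm{mmse}(x \mid x_\gamma) - \mathrm{mmse}(x \mid x_\gamma, y) \Big)\, d\gamma ,
  \qquad
  \mathrm{mmse}(x \mid \cdot\,) \;=\; \mathbb{E}\,\big\| x - \mathbb{E}[x \mid \cdot\,] \big\|^2 .
\]
```

In a diffusion model the two conditional expectations are supplied by the trained denoiser with and without the text input, which is what makes the estimate computable.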
arXiv:2306.09520 [pdf, other]  cs.LG (Machine Learning); cs.AI (Artificial Intelligence); stat.ME (Methodology); stat.ML (Machine Learning)
Ensembled Prediction Intervals for Causal Outcomes Under Hidden Confounding
Authors: Myrl G. Marmarelis, Greg Ver Steeg, Aram Galstyan, Fred Morstatter
Abstract: Causal inference of exact individual treatment outcomes in the presence of hidden confounders is rarely possible. Recent work has extended prediction intervals with finite-sample guarantees to partially identifiable causal outcomes, by means of a sensitivity model for hidden confounding. In deep learning, predictors can exploit their inductive biases for better generalization out of sample. We argue that the structure inherent to a deep ensemble should inform a tighter partial identification of the causal outcomes that it predicts. We therefore introduce an approach termed Caus-Modens for characterizing causal outcome intervals by modulated ensembles. We present a simple approach to partial identification using existing causal sensitivity models and show empirically that Caus-Modens gives tighter outcome intervals, as measured by the necessary interval size to achieve sufficient coverage. The last of our three diverse benchmarks is a novel usage of GPT-4 for observational experiments with unknown but probeable ground truth.
Submitted 1 November, 2023; v1 submitted 15 June, 2023; originally announced June 2023.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.06302v1-abstract-full').style.display = 'none'; document.getElementById('2306.06302v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.19264">arXiv:2305.19264</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2305.19264">pdf</a>, <a href="https://arxiv.org/format/2305.19264">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Jointly Reparametrized Multi-Layer Adaptation for Efficient and Private Tuning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gupta%2C+U">Umang Gupta</a>, <a href="/search/cs?searchtype=author&amp;query=Galstyan%2C+A">Aram Galstyan</a>, <a href="/search/cs?searchtype=author&amp;query=Steeg%2C+G+V">Greg Ver Steeg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.19264v1-abstract-short" style="display: inline;"> Efficient finetuning of pretrained language transformers is becoming increasingly prevalent for solving natural language processing tasks. While effective, it can still require a large number of tunable parameters. This can be a drawback for low-resource applications and training with differential-privacy constraints, where excessive noise may be introduced during finetuning. To this end, we propo&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.19264v1-abstract-full').style.display = 'inline'; document.getElementById('2305.19264v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.19264v1-abstract-full" style="display: none;"> Efficient finetuning of pretrained language transformers is becoming increasingly prevalent for solving natural language processing tasks. While effective, it can still require a large number of tunable parameters. This can be a drawback for low-resource applications and training with differential-privacy constraints, where excessive noise may be introduced during finetuning. To this end, we propose a novel language transformer finetuning strategy that introduces task-specific parameters in multiple transformer layers. These parameters are derived from fixed random projections of a single trainable vector, enabling finetuning with significantly fewer parameters while maintaining performance. We achieve within 5% of full finetuning performance on GLUE tasks with as few as 4,100 parameters per task, outperforming other parameter-efficient finetuning approaches that use a similar number of per-task parameters. Besides, the random projections can be precomputed at inference, avoiding additional computational latency. All these make our method particularly appealing for low-resource applications. 
Finally, our method achieves the best or comparable utility compared to several recent finetuning methods when training with the same privacy constraints, underscoring its effectiveness and potential real-world impact. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.19264v1-abstract-full').style.display = 'none'; document.getElementById('2305.19264v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To appear in the Findings of ACL 2023. Code available at https://github.com/umgupta/jointly-reparametrized-finetuning</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.16597">arXiv:2305.16597</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2305.16597">pdf</a>, <a href="https://arxiv.org/format/2305.16597">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Neural Architecture Search for Parameter-Efficient Fine-tuning of Large Pre-trained Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lawton%2C+N">Neal Lawton</a>, <a href="/search/cs?searchtype=author&amp;query=Kumar%2C+A">Anoop Kumar</a>, <a href="/search/cs?searchtype=author&amp;query=Thattai%2C+G">Govind Thattai</a>, <a href="/search/cs?searchtype=author&amp;query=Galstyan%2C+A">Aram Galstyan</a>, <a href="/search/cs?searchtype=author&amp;query=Steeg%2C+G+V">Greg Ver Steeg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.16597v1-abstract-short" style="display: inline;"> Parameter-efficient tuning (PET) methods fit pre-trained language models (PLMs) to downstream tasks by either computing a small compressed update for a subset of model parameters, or appending and fine-tuning a small number of new model parameters to the pre-trained network. Hand-designed PET architectures from the literature perform well in practice, but have the potential to be improved via auto&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.16597v1-abstract-full').style.display = 'inline'; document.getElementById('2305.16597v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.16597v1-abstract-full" style="display: none;"> Parameter-efficient tuning (PET) methods fit pre-trained language models (PLMs) to downstream tasks by either computing a small compressed update for a subset of model parameters, or appending and fine-tuning a small number of new model parameters to the pre-trained network. 
Hand-designed PET architectures from the literature perform well in practice, but have the potential to be improved via automated neural architecture search (NAS). We propose an efficient NAS method for learning PET architectures via structured and unstructured pruning. We present experiments on GLUE demonstrating the effectiveness of our algorithm and discuss how PET architectural design choices affect performance in practice. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.16597v1-abstract-full').style.display = 'none'; document.getElementById('2305.16597v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 3 figures, ACL 2023</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.7 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.10625">arXiv:2305.10625</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2305.10625">pdf</a>, <a href="https://arxiv.org/format/2305.10625">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Measuring and Mitigating Local Instability in Deep Neural Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Datta%2C+A">Arghya Datta</a>, <a href="/search/cs?searchtype=author&amp;query=Nandi%2C+S">Subhrangshu Nandi</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+J">Jingcheng Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Steeg%2C+G+V">Greg Ver Steeg</a>, <a href="/search/cs?searchtype=author&amp;query=Xie%2C+H">He Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Kumar%2C+A">Anoop Kumar</a>, <a href="/search/cs?searchtype=author&amp;query=Galstyan%2C+A">Aram Galstyan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.10625v2-abstract-short" style="display: inline;"> Deep Neural Networks (DNNs) are becoming integral components of real world services relied upon by millions of users. Unfortunately, architects of these systems can find it difficult to ensure reliable performance as irrelevant details like random initialization can unexpectedly change the outputs of a trained system with potentially disastrous consequences. We formulate the model stability proble&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.10625v2-abstract-full').style.display = 'inline'; document.getElementById('2305.10625v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.10625v2-abstract-full" style="display: none;"> Deep Neural Networks (DNNs) are becoming integral components of real world services relied upon by millions of users. 
Unfortunately, architects of these systems can find it difficult to ensure reliable performance as irrelevant details like random initialization can unexpectedly change the outputs of a trained system with potentially disastrous consequences. We formulate the model stability problem by studying how the predictions of a model change, even when it is retrained on the same data, as a consequence of stochasticity in the training process. For Natural Language Understanding (NLU) tasks, we find instability in predictions for a significant fraction of queries. We formulate principled metrics, like per-sample ``label entropy&#39;&#39; across training runs or within a single training run, to quantify this phenomenon. Intriguingly, we find that unstable predictions do not appear at random, but rather appear to be clustered in data-specific ways. We study data-agnostic regularization methods to improve stability and propose new data-centric methods that exploit our local stability estimates. We find that our localized data-specific mitigation strategy dramatically outperforms data-agnostic methods, and comes within 90% of the gold standard, achieved by ensembling, at a fraction of the computational cost. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.10625v2-abstract-full').style.display = 'none'; document.getElementById('2305.10625v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To be published in Findings of the Association for Computational Linguistics (ACL), 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2303.06992">arXiv:2303.06992</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2303.06992">pdf</a>, <a href="https://arxiv.org/format/2303.06992">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Improving Mutual Information Estimation with Annealed and Energy-Based Bounds </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Brekelmans%2C+R">Rob Brekelmans</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+S">Sicong Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Ghassemi%2C+M">Marzyeh Ghassemi</a>, <a href="/search/cs?searchtype=author&amp;query=Steeg%2C+G+V">Greg Ver Steeg</a>, <a href="/search/cs?searchtype=author&amp;query=Grosse%2C+R">Roger Grosse</a>, <a href="/search/cs?searchtype=author&amp;query=Makhzani%2C+A">Alireza Makhzani</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2303.06992v1-abstract-short" style="display: inline;"> Mutual information (MI) is a fundamental quantity in information theory and machine
learning. However, direct estimation of MI is intractable, even if the true joint probability density for the variables of interest is known, as it involves estimating a potentially high-dimensional log partition function. In this work, we present a unifying view of existing MI bounds from the perspective of import&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.06992v1-abstract-full').style.display = 'inline'; document.getElementById('2303.06992v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2303.06992v1-abstract-full" style="display: none;"> Mutual information (MI) is a fundamental quantity in information theory and machine learning. However, direct estimation of MI is intractable, even if the true joint probability density for the variables of interest is known, as it involves estimating a potentially high-dimensional log partition function. In this work, we present a unifying view of existing MI bounds from the perspective of importance sampling, and propose three novel bounds based on this approach. Since accurate estimation of MI without density information requires a sample size exponential in the true MI, we assume either a single marginal or the full joint density information is known. In settings where the full joint density is available, we propose Multi-Sample Annealed Importance Sampling (AIS) bounds on MI, which we demonstrate can tightly estimate large values of MI in our experiments. In settings where only a single marginal distribution is known, we propose Generalized IWAE (GIWAE) and MINE-AIS bounds. Our GIWAE bound unifies variational and contrastive bounds in a single framework that generalizes InfoNCE, IWAE, and Barber-Agakov bounds. Our MINE-AIS method improves upon existing energy-based methods such as MINE-DV and MINE-F by directly optimizing a tighter lower bound on MI. MINE-AIS uses MCMC sampling to estimate gradients for training and Multi-Sample AIS for evaluating the bound. Our methods are particularly suitable for evaluating MI in deep generative models, since explicit forms of the marginal or joint densities are often available. We evaluate our bounds on estimating the MI of VAEs and GANs trained on the MNIST and CIFAR datasets, and showcase significant gains over existing bounds in these challenging settings with high ground truth MI. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.06992v1-abstract-full').style.display = 'none'; document.getElementById('2303.06992v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 March, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2023. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">A shorter version appeared in the International Conference on Learning Representations (ICLR) 2022</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> ICLR 2022 https://openreview.net/forum?id=T0B9AoM_bFg </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2303.01491">arXiv:2303.01491</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2303.01491">pdf</a>, <a href="https://arxiv.org/format/2303.01491">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Quantitative Methods">q-bio.QM</span> </div> </div> <p class="title is-5 mathjax"> Transferring Models Trained on Natural Images to 3D MRI via Position Encoded Slice Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gupta%2C+U">Umang Gupta</a>, <a href="/search/cs?searchtype=author&amp;query=Chattopadhyay%2C+T">Tamoghna Chattopadhyay</a>, <a href="/search/cs?searchtype=author&amp;query=Dhinagar%2C+N">Nikhil Dhinagar</a>, <a href="/search/cs?searchtype=author&amp;query=Thompson%2C+P+M">Paul M. Thompson</a>, <a href="/search/cs?searchtype=author&amp;query=Steeg%2C+G+V">Greg Ver Steeg</a>, <a href="/search/cs?searchtype=author&amp;query=Initiative%2C+T+A+D+N">The Alzheimer&#39;s Disease Neuroimaging Initiative</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2303.01491v1-abstract-short" style="display: inline;"> Transfer learning has remarkably improved computer vision. These advances also promise improvements in neuroimaging, where training set sizes are often small. However, various difficulties arise in directly applying models pretrained on natural images to radiologic images, such as MRIs. In particular, a mismatch in the input space (2D images vs. 3D MRIs) restricts the direct transfer of models, of&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.01491v1-abstract-full').style.display = 'inline'; document.getElementById('2303.01491v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2303.01491v1-abstract-full" style="display: none;"> Transfer learning has remarkably improved computer vision. These advances also promise improvements in neuroimaging, where training set sizes are often small. However, various difficulties arise in directly applying models pretrained on natural images to radiologic images, such as MRIs. In particular, a mismatch in the input space (2D images vs. 3D MRIs) restricts the direct transfer of models, often forcing us to consider only a few MRI slices as input. To this end, we leverage the 2D-Slice-CNN architecture of Gupta et al. (2021), which embeds all the MRI slices with 2D encoders (neural networks that take 2D image input) and combines them via permutation-invariant layers. 
With the insight that the pretrained model can serve as the 2D encoder, we initialize the 2D encoder with ImageNet pretrained weights that outperform those initialized and trained from scratch on two neuroimaging tasks -- brain age prediction on the UK Biobank dataset and Alzheimer&#39;s disease detection on the ADNI dataset. Further, we improve the modeling capabilities of 2D-Slice models by incorporating spatial information through position embeddings, which can improve the performance in some cases. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.01491v1-abstract-full').style.display = 'none'; document.getElementById('2303.01491v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 March, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To appear at IEEE International Symposium on Biomedical Imaging 2023 (ISBI 2023). Code is available at https://github.com/umgupta/2d-slice-set-networks</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2302.03792">arXiv:2302.03792</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2302.03792">pdf</a>, <a href="https://arxiv.org/format/2302.03792">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> </div> </div> <p class="title is-5 mathjax"> Information-Theoretic Diffusion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Kong%2C+X">Xianghao Kong</a>, <a href="/search/cs?searchtype=author&amp;query=Brekelmans%2C+R">Rob Brekelmans</a>, <a href="/search/cs?searchtype=author&amp;query=Steeg%2C+G+V">Greg Ver Steeg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2302.03792v1-abstract-short" style="display: inline;"> Denoising diffusion models have spurred significant gains in density modeling and image generation, precipitating an industrial revolution in text-guided AI art generation. We introduce a new mathematical foundation for diffusion models inspired by classic results in information theory that connect Information with Minimum Mean Square Error regression, the so-called I-MMSE relations. We generalize&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2302.03792v1-abstract-full').style.display = 'inline'; document.getElementById('2302.03792v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2302.03792v1-abstract-full" style="display: none;"> Denoising diffusion models have spurred significant gains in density modeling and image generation, precipitating an industrial revolution in text-guided AI art generation. 
We introduce a new mathematical foundation for diffusion models inspired by classic results in information theory that connect Information with Minimum Mean Square Error regression, the so-called I-MMSE relations. We generalize the I-MMSE relations to exactly relate the data distribution to an optimal denoising regression problem, leading to an elegant refinement of existing diffusion bounds. This new insight leads to several improvements for probability distribution estimation, including theoretical justification for diffusion model ensembling. Remarkably, our framework shows how continuous and discrete probabilities can be learned with the same regression objective, avoiding domain-specific generative models used in variational methods. Code to reproduce experiments is provided at http://github.com/kxh001/ITdiffusion and simplified demonstration code is at http://github.com/gregversteeg/InfoDiffusionSimple. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2302.03792v1-abstract-full').style.display = 'none'; document.getElementById('2302.03792v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 February, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">26 pages, 7 figures, International Conference on Learning Representations (ICLR), 2023. Code is at http://github.com/kxh001/ITdiffusion and http://github.com/gregversteeg/InfoDiffusionSimple</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2208.11669">arXiv:2208.11669</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2208.11669">pdf</a>, <a href="https://arxiv.org/format/2208.11669">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Quantitative Methods">q-bio.QM</span> </div> </div> <p class="title is-5 mathjax"> Towards Sparsified Federated Neuroimaging Models via Weight Pruning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Stripelis%2C+D">Dimitris Stripelis</a>, <a href="/search/cs?searchtype=author&amp;query=Gupta%2C+U">Umang Gupta</a>, <a href="/search/cs?searchtype=author&amp;query=Dhinagar%2C+N">Nikhil Dhinagar</a>, <a href="/search/cs?searchtype=author&amp;query=Steeg%2C+G+V">Greg Ver Steeg</a>, <a href="/search/cs?searchtype=author&amp;query=Thompson%2C+P">Paul Thompson</a>, <a href="/search/cs?searchtype=author&amp;query=Ambite%2C+J+L">José Luis Ambite</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2208.11669v1-abstract-short" style="display: inline;"> Federated training of large deep neural networks can often be restrictive due to the increasing costs of communicating the updates with
increasing model sizes. Various model pruning techniques have been designed in centralized settings to reduce inference times. Combining centralized pruning techniques with federated training seems intuitive for reducing communication costs -- by pruning the model&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2208.11669v1-abstract-full').style.display = 'inline'; document.getElementById('2208.11669v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2208.11669v1-abstract-full" style="display: none;"> Federated training of large deep neural networks can often be restrictive due to the increasing costs of communicating the updates with increasing model sizes. Various model pruning techniques have been designed in centralized settings to reduce inference times. Combining centralized pruning techniques with federated training seems intuitive for reducing communication costs -- by pruning the model parameters right before the communication step. Moreover, such a progressive model pruning approach during training can also reduce training times/costs. To this end, we propose FedSparsify, which performs model pruning during federated training. In our experiments in centralized and federated settings on the brain age prediction task (estimating a person&#39;s age from their brain MRI), we demonstrate that models can be pruned up to 95% sparsity without affecting performance even in challenging federated learning environments with highly heterogeneous data distributions. One surprising benefit of model pruning is improved model privacy. We demonstrate that models with high sparsity are less susceptible to membership inference attacks, a type of privacy attack. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2208.11669v1-abstract-full').style.display = 'none'; document.getElementById('2208.11669v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 August, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2022. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to 3rd MICCAI Workshop on Distributed, Collaborative and Federated Learning (DeCaF, 2022)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2205.06915">arXiv:2205.06915</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2205.06915">pdf</a>, <a href="https://arxiv.org/ps/2205.06915">ps</a>, <a href="https://arxiv.org/format/2205.06915">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/ITW54588.2022.9965850">10.1109/ITW54588.2022.9965850 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Formal limitations of sample-wise information-theoretic generalization bounds </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Harutyunyan%2C+H">Hrayr Harutyunyan</a>, <a href="/search/cs?searchtype=author&amp;query=Steeg%2C+G+V">Greg Ver Steeg</a>, <a href="/search/cs?searchtype=author&amp;query=Galstyan%2C+A">Aram Galstyan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2205.06915v2-abstract-short" style="display: inline;"> Some of the tightest information-theoretic generalization bounds depend on the average information between the learned hypothesis and a single training example. However, these sample-wise bounds were derived only for expected generalization gap. We show that even for expected squared generalization gap no such sample-wise information-theoretic bounds exist. The same is true for PAC-Bayes and singl&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2205.06915v2-abstract-full').style.display = 'inline'; document.getElementById('2205.06915v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2205.06915v2-abstract-full" style="display: none;"> Some of the tightest information-theoretic generalization bounds depend on the average information between the learned hypothesis and a single training example. However, these sample-wise bounds were derived only for expected generalization gap. We show that even for expected squared generalization gap no such sample-wise information-theoretic bounds exist. The same is true for PAC-Bayes and single-draw bounds. Remarkably, PAC-Bayes, single-draw and expected squared generalization gap bounds that depend on information in pairs of examples exist. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2205.06915v2-abstract-full').style.display = 'none'; document.getElementById('2205.06915v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 December, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 May, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">2022 IEEE Information Theory Workshop</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2205.05249">arXiv:2205.05249</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2205.05249">pdf</a>, <a href="https://arxiv.org/format/2205.05249">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> Secure &amp; Private Federated Neuroimaging </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Stripelis%2C+D">Dimitris Stripelis</a>, <a href="/search/cs?searchtype=author&amp;query=Gupta%2C+U">Umang Gupta</a>, <a href="/search/cs?searchtype=author&amp;query=Saleem%2C+H">Hamza Saleem</a>, <a href="/search/cs?searchtype=author&amp;query=Dhinagar%2C+N">Nikhil Dhinagar</a>, <a href="/search/cs?searchtype=author&amp;query=Ghai%2C+T">Tanmay Ghai</a>, <a href="/search/cs?searchtype=author&amp;query=Anastasiou%2C+R+C">Rafael Chrysovalantis Anastasiou</a>, <a href="/search/cs?searchtype=author&amp;query=Asghar%2C+A">Armaghan Asghar</a>, <a href="/search/cs?searchtype=author&amp;query=Steeg%2C+G+V">Greg Ver Steeg</a>, <a href="/search/cs?searchtype=author&amp;query=Ravi%2C+S">Srivatsan Ravi</a>, <a href="/search/cs?searchtype=author&amp;query=Naveed%2C+M">Muhammad Naveed</a>, <a href="/search/cs?searchtype=author&amp;query=Thompson%2C+P+M">Paul M. Thompson</a>, <a href="/search/cs?searchtype=author&amp;query=Ambite%2C+J+L">Jose Luis Ambite</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2205.05249v2-abstract-short" style="display: inline;"> The amount of biomedical data continues to grow rapidly. However, collecting data from multiple sites for joint analysis remains challenging due to security, privacy, and regulatory concerns. To overcome this challenge, we use Federated Learning, which enables distributed training of neural network models over multiple data sources without sharing data. 
Each site trains the neural network over its&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2205.05249v2-abstract-full').style.display = 'inline'; document.getElementById('2205.05249v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2205.05249v2-abstract-full" style="display: none;"> The amount of biomedical data continues to grow rapidly. However, collecting data from multiple sites for joint analysis remains challenging due to security, privacy, and regulatory concerns. To overcome this challenge, we use Federated Learning, which enables distributed training of neural network models over multiple data sources without sharing data. Each site trains the neural network over its private data for some time, then shares the neural network parameters (i.e., weights, gradients) with a Federation Controller, which in turn aggregates the local models, sends the resulting community model back to each site, and the process repeats. Our Federated Learning architecture, MetisFL, provides strong security and privacy. First, sample data never leaves a site. Second, neural network parameters are encrypted before transmission and the global neural model is computed under fully-homomorphic encryption. Finally, we use information-theoretic methods to limit information leakage from the neural model to prevent a curious site from performing model inversion or membership attacks. We present a thorough evaluation of the performance of secure, private federated learning in neuroimaging tasks, including for predicting Alzheimer&#39;s disease and estimating BrainAGE from magnetic resonance imaging (MRI) studies, in challenging, heterogeneous federated environments where sites have different amounts of data and statistical distributions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2205.05249v2-abstract-full').style.display = 'none'; document.getElementById('2205.05249v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 May, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2022. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">18 pages, 13 figures, 2 tables</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2; I.5.1; J.3 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2204.12430">arXiv:2204.12430</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2204.12430">pdf</a>, <a href="https://arxiv.org/format/2204.12430">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Federated Progressive Sparsification (Purge, Merge, Tune)+ </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Stripelis%2C+D">Dimitris Stripelis</a>, <a href="/search/cs?searchtype=author&amp;query=Gupta%2C+U">Umang Gupta</a>, <a href="/search/cs?searchtype=author&amp;query=Steeg%2C+G+V">Greg Ver Steeg</a>, <a href="/search/cs?searchtype=author&amp;query=Ambite%2C+J+L">Jose Luis Ambite</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2204.12430v2-abstract-short" style="display: inline;"> To improve federated training of neural networks, we develop FedSparsify, a sparsification strategy based on progressive weight magnitude pruning. Our method has several benefits. First, since the size of the network becomes increasingly smaller, computation and communication costs during training are reduced. Second, the models are incrementally constrained to a smaller set of parameters, which f&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2204.12430v2-abstract-full').style.display = 'inline'; document.getElementById('2204.12430v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2204.12430v2-abstract-full" style="display: none;"> To improve federated training of neural networks, we develop FedSparsify, a sparsification strategy based on progressive weight magnitude pruning. Our method has several benefits. First, since the size of the network becomes increasingly smaller, computation and communication costs during training are reduced. Second, the models are incrementally constrained to a smaller set of parameters, which facilitates alignment/merging of the local models and improved learning performance at high sparsification rates. Third, the final sparsified model is significantly smaller, which improves inference efficiency and optimizes operations latency during encrypted communication. We show experimentally that FedSparsify learns a subnetwork of both high sparsity and learning performance. Our sparse models can reach a tenth of the size of the original model with the same or better accuracy compared to existing pruning and nonpruning baselines. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2204.12430v2-abstract-full').style.display = 'none'; document.getElementById('2204.12430v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 April, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at the Workshop on Federated Learning: Recent Advances and New Challenges, in Conjunction with NeurIPS 2022 (FL-NeurIPS&#39;22) 23 pages, 12 figures, 1 algorithm, 2 Tables</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68T07 <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.m </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2204.11206">arXiv:2204.11206</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2204.11206">pdf</a>, <a href="https://arxiv.org/format/2204.11206">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Methodology">stat.ME</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Partial Identification of Dose Responses with Hidden Confounders </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Marmarelis%2C+M+G">Myrl G. Marmarelis</a>, <a href="/search/cs?searchtype=author&amp;query=Haddad%2C+E">Elizabeth Haddad</a>, <a href="/search/cs?searchtype=author&amp;query=Jesson%2C+A">Andrew Jesson</a>, <a href="/search/cs?searchtype=author&amp;query=Jahanshad%2C+N">Neda Jahanshad</a>, <a href="/search/cs?searchtype=author&amp;query=Galstyan%2C+A">Aram Galstyan</a>, <a href="/search/cs?searchtype=author&amp;query=Steeg%2C+G+V">Greg Ver Steeg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2204.11206v3-abstract-short" style="display: inline;"> Inferring causal effects of continuous-valued treatments from observational data is a crucial task promising to better inform policy- and decision-makers. A critical assumption needed to identify these effects is that all confounding variables -- causal parents of both the treatment and the outcome -- are included as covariates. 
Unfortunately, given observational data alone, we cannot know with ce&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2204.11206v3-abstract-full').style.display = 'inline'; document.getElementById('2204.11206v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2204.11206v3-abstract-full" style="display: none;"> Inferring causal effects of continuous-valued treatments from observational data is a crucial task promising to better inform policy- and decision-makers. A critical assumption needed to identify these effects is that all confounding variables -- causal parents of both the treatment and the outcome -- are included as covariates. Unfortunately, given observational data alone, we cannot know with certainty that this criterion is satisfied. Sensitivity analyses provide principled ways to give bounds on causal estimates when confounding variables are hidden. While much attention is focused on sensitivity analyses for discrete-valued treatments, much less is paid to continuous-valued treatments. We present novel methodology to bound both average and conditional average continuous-valued treatment-effect estimates when they cannot be point identified due to hidden confounding. A semi-synthetic benchmark on multiple datasets shows our method giving tighter coverage of the true dose-response curve than a recently proposed continuous sensitivity model and baselines. Finally, we apply our method to a real-world observational case study to demonstrate the value of identifying dose-dependent causal effects. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2204.11206v3-abstract-full').style.display = 'none'; document.getElementById('2204.11206v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 April, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2022. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2203.12574">arXiv:2203.12574</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2203.12574">pdf</a>, <a href="https://arxiv.org/format/2203.12574">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Mitigating Gender Bias in Distilled Language Models via Counterfactual Role Reversal </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gupta%2C+U">Umang Gupta</a>, <a href="/search/cs?searchtype=author&amp;query=Dhamala%2C+J">Jwala Dhamala</a>, <a href="/search/cs?searchtype=author&amp;query=Kumar%2C+V">Varun Kumar</a>, <a href="/search/cs?searchtype=author&amp;query=Verma%2C+A">Apurv Verma</a>, <a href="/search/cs?searchtype=author&amp;query=Pruksachatkun%2C+Y">Yada Pruksachatkun</a>, <a href="/search/cs?searchtype=author&amp;query=Krishna%2C+S">Satyapriya Krishna</a>, <a href="/search/cs?searchtype=author&amp;query=Gupta%2C+R">Rahul Gupta</a>, <a href="/search/cs?searchtype=author&amp;query=Chang%2C+K">Kai-Wei Chang</a>, <a href="/search/cs?searchtype=author&amp;query=Steeg%2C+G+V">Greg Ver Steeg</a>, <a href="/search/cs?searchtype=author&amp;query=Galstyan%2C+A">Aram Galstyan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2203.12574v1-abstract-short" style="display: inline;"> Language models excel at generating coherent text, and model compression techniques such as knowledge distillation have enabled their use in resource-constrained settings. However, these models can be biased in multiple ways, including the unfounded association of male and female genders with gender-neutral professions. Therefore, knowledge distillation without any fairness constraints may preserv&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.12574v1-abstract-full').style.display = 'inline'; document.getElementById('2203.12574v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2203.12574v1-abstract-full" style="display: none;"> Language models excel at generating coherent text, and model compression techniques such as knowledge distillation have enabled their use in resource-constrained settings. However, these models can be biased in multiple ways, including the unfounded association of male and female genders with gender-neutral professions. Therefore, knowledge distillation without any fairness constraints may preserve or exaggerate the teacher model&#39;s biases onto the distilled model. To this end, we present a novel approach to mitigate gender disparity in text generation by learning a fair model during knowledge distillation. We propose two modifications to the base knowledge distillation based on counterfactual role reversal$\unicode{x2014}$modifying teacher probabilities and augmenting the training set. 
We evaluate gender polarity across professions in open-ended text generated from the resulting distilled and finetuned GPT$\unicode{x2012}$2 models and demonstrate a substantial reduction in gender disparity with only a minor compromise in utility. Finally, we observe that language models that reduce gender polarity in language generation do not improve embedding fairness or downstream classification fairness. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.12574v1-abstract-full').style.display = 'none'; document.getElementById('2203.12574v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 March, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To appear in the Findings of ACL 2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2203.10204">arXiv:2203.10204</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2203.10204">pdf</a>, <a href="https://arxiv.org/format/2203.10204">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Materials Science">cond-mat.mtrl-sci</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Disordered Systems and Neural Networks">cond-mat.dis-nn</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Inferring topological transitions in pattern-forming processes with self-supervised learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Abram%2C+M">Marcin Abram</a>, <a href="/search/cs?searchtype=author&amp;query=Burghardt%2C+K">Keith Burghardt</a>, <a href="/search/cs?searchtype=author&amp;query=Steeg%2C+G+V">Greg Ver Steeg</a>, <a href="/search/cs?searchtype=author&amp;query=Galstyan%2C+A">Aram Galstyan</a>, <a href="/search/cs?searchtype=author&amp;query=Dingreville%2C+R">Remi Dingreville</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2203.10204v2-abstract-short" style="display: inline;"> The identification and classification of transitions in topological and microstructural regimes in pattern-forming processes are critical for understanding and fabricating microstructurally precise novel materials in many application domains. 
Unfortunately, relevant microstructure transitions may depend on process parameters in subtle and complex ways that are not captured by the classic theory of&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.10204v2-abstract-full').style.display = 'inline'; document.getElementById('2203.10204v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2203.10204v2-abstract-full" style="display: none;"> The identification and classification of transitions in topological and microstructural regimes in pattern-forming processes are critical for understanding and fabricating microstructurally precise novel materials in many application domains. Unfortunately, relevant microstructure transitions may depend on process parameters in subtle and complex ways that are not captured by the classic theory of phase transition. While supervised machine learning methods may be useful for identifying transition regimes, they need labels which require prior knowledge of order parameters or relevant structures describing these transitions. Motivated by the universality principle for dynamical systems, we instead use a self-supervised approach to solve the inverse problem of predicting process parameters from observed microstructures using neural networks. This approach does not require predefined, labeled data about the different classes of microstructural patterns or about the target task of predicting microstructure transitions. We show that the difficulty of performing the inverse-problem prediction task is related to the goal of discovering microstructure regimes, because qualitative changes in microstructural patterns correspond to changes in uncertainty predictions for our self-supervised problem. We demonstrate the value of our approach by automatically discovering transitions in microstructural regimes in two distinct pattern-forming processes: the spinodal decomposition of a two-phase mixture and the formation of concentration modulations of binary alloys during physical vapor deposition of thin films. This approach opens a promising path forward for discovering and understanding unseen or hard-to-discern transition regimes, and ultimately for controlling complex pattern-forming processes. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.10204v2-abstract-full').style.display = 'none'; document.getElementById('2203.10204v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 August, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 March, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2022. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">17 pages, 6 figures, 8 pages of supplementary information</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.6; I.4.7; I.5.4; I.6.m; J.2 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2111.13733">arXiv:2111.13733</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2111.13733">pdf</a>, <a href="https://arxiv.org/format/2111.13733">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Failure Modes of Domain Generalization Algorithms </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Galstyan%2C+T">Tigran Galstyan</a>, <a href="/search/cs?searchtype=author&amp;query=Harutyunyan%2C+H">Hrayr Harutyunyan</a>, <a href="/search/cs?searchtype=author&amp;query=Khachatrian%2C+H">Hrant Khachatrian</a>, <a href="/search/cs?searchtype=author&amp;query=Steeg%2C+G+V">Greg Ver Steeg</a>, <a href="/search/cs?searchtype=author&amp;query=Galstyan%2C+A">Aram Galstyan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2111.13733v1-abstract-short" style="display: inline;"> Domain generalization algorithms use training data from multiple domains to learn models that generalize well to unseen domains. While recently proposed benchmarks demonstrate that most of the existing algorithms do not outperform simple baselines, the established evaluation methods fail to expose the impact of various factors that contribute to the poor performance. In this paper we propose an ev&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2111.13733v1-abstract-full').style.display = 'inline'; document.getElementById('2111.13733v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2111.13733v1-abstract-full" style="display: none;"> Domain generalization algorithms use training data from multiple domains to learn models that generalize well to unseen domains. While recently proposed benchmarks demonstrate that most of the existing algorithms do not outperform simple baselines, the established evaluation methods fail to expose the impact of various factors that contribute to the poor performance. In this paper we propose an evaluation framework for domain generalization algorithms that allows decomposition of the error into components capturing distinct aspects of generalization. Inspired by the prevalence of algorithms based on the idea of domain-invariant representation learning, we extend the evaluation framework to capture various types of failures in achieving invariance. We show that the largest contributor to the generalization error varies across methods, datasets, regularization strengths and even training lengths. We observe two problems associated with the strategy of learning domain-invariant representations. On Colored MNIST, most domain generalization algorithms fail because they reach domain-invariance only on the training domains. 
On Camelyon-17, domain-invariance degrades the quality of representations on unseen domains. We hypothesize that focusing instead on tuning the classifier on top of a rich representation can be a promising direction. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2111.13733v1-abstract-full').style.display = 'none'; document.getElementById('2111.13733v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 November, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2021. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2111.06312">arXiv:2111.06312</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2111.06312">pdf</a>, <a href="https://arxiv.org/format/2111.06312">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Mathematical Software">cs.MS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> </div> </div> <p class="title is-5 mathjax"> Implicit SVD for Graph Representation Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Abu-El-Haija%2C+S">Sami Abu-El-Haija</a>, <a href="/search/cs?searchtype=author&amp;query=Mostafa%2C+H">Hesham Mostafa</a>, <a href="/search/cs?searchtype=author&amp;query=Nassar%2C+M">Marcel Nassar</a>, <a href="/search/cs?searchtype=author&amp;query=Crespi%2C+V">Valentino Crespi</a>, <a href="/search/cs?searchtype=author&amp;query=Steeg%2C+G+V">Greg Ver Steeg</a>, <a href="/search/cs?searchtype=author&amp;query=Galstyan%2C+A">Aram Galstyan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2111.06312v1-abstract-short" style="display: inline;"> Recent improvements in the performance of state-of-the-art (SOTA) methods for Graph Representational Learning (GRL) have come at the cost of significant computational resource requirements for training, e.g., for calculating gradients via backprop over many data epochs. Meanwhile, Singular Value Decomposition (SVD) can find closed-form solutions to convex problems, using merely a handful of epochs&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2111.06312v1-abstract-full').style.display = 'inline'; document.getElementById('2111.06312v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2111.06312v1-abstract-full" style="display: none;"> Recent improvements in the performance of state-of-the-art (SOTA) methods for Graph Representational Learning (GRL) have come at the cost of significant computational resource requirements for training, e.g., for calculating gradients via backprop over many data epochs. Meanwhile, Singular Value Decomposition (SVD) can find closed-form solutions to convex problems, using merely a handful of epochs. 
In this paper, we make GRL more computationally tractable for those with modest hardware. We design a framework that computes SVD of \textit{implicitly} defined matrices, and apply this framework to several GRL tasks. For each task, we derive a linear approximation of a SOTA model, where we design an (expensive-to-store) matrix $\mathbf{M}$ and train the model, in closed-form, via SVD of $\mathbf{M}$, without calculating entries of $\mathbf{M}$. By converging to a unique point in one step, and without calculating gradients, our models show competitive empirical test performance over various graphs such as article citation and biological interaction networks. More importantly, SVD can initialize a deeper model that is architected to be non-linear almost everywhere, yet behaves linearly when its parameters reside on a hyperplane onto which SVD initializes them. The deeper model can then be fine-tuned within only a few epochs. Overall, our procedure trains hundreds of times faster than state-of-the-art methods, while competing on empirical test performance. We open-source our implementation at: https://github.com/samihaija/isvd <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2111.06312v1-abstract-full').style.display = 'none'; document.getElementById('2111.06312v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 November, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Advances in Neural Information Processing Systems (NeurIPS) 2021 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2111.02434">arXiv:2111.02434</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2111.02434">pdf</a>, <a href="https://arxiv.org/format/2111.02434">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computational Physics">physics.comp-ph</span> </div> </div> <p class="title is-5 mathjax"> Hamiltonian Dynamics with Non-Newtonian Momentum for Rapid Sampling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Steeg%2C+G+V">Greg Ver Steeg</a>, <a href="/search/cs?searchtype=author&amp;query=Galstyan%2C+A">Aram Galstyan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2111.02434v3-abstract-short" style="display: inline;"> Sampling from an unnormalized probability distribution is a fundamental problem in machine learning with applications including Bayesian modeling, latent factor inference, and energy-based model training. After decades of research, variations of MCMC remain the default approach to sampling despite slow convergence.
Auxiliary neural models can learn to speed up MCMC, but the overhead for training t&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2111.02434v3-abstract-full').style.display = 'inline'; document.getElementById('2111.02434v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2111.02434v3-abstract-full" style="display: none;"> Sampling from an unnormalized probability distribution is a fundamental problem in machine learning with applications including Bayesian modeling, latent factor inference, and energy-based model training. After decades of research, variations of MCMC remain the default approach to sampling despite slow convergence. Auxiliary neural models can learn to speed up MCMC, but the overhead for training the extra model can be prohibitive. We propose a fundamentally different approach to this problem via a new Hamiltonian dynamics with a non-Newtonian momentum. In contrast to MCMC approaches like Hamiltonian Monte Carlo, no stochastic step is required. Instead, the proposed deterministic dynamics in an extended state space exactly sample the target distribution, specified by an energy function, under an assumption of ergodicity. Alternatively, the dynamics can be interpreted as a normalizing flow that samples a specified energy model without training. The proposed Energy Sampling Hamiltonian (ESH) dynamics have a simple form that can be solved with existing ODE solvers, but we derive a specialized solver that exhibits much better performance. ESH dynamics converge faster than their MCMC competitors enabling faster, more stable training of neural network energy models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2111.02434v3-abstract-full').style.display = 'none'; document.getElementById('2111.02434v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 December, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 November, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">31 pages, 19 figures. Advances in Neural Information Processing Systems (NeurIPS), 2021. 
Animations at https://sites.google.com/view/esh-dynamics/home, code at https://github.com/gregversteeg/esh_dynamics</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2110.01584">arXiv:2110.01584</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2110.01584">pdf</a>, <a href="https://arxiv.org/format/2110.01584">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Information-theoretic generalization bounds for black-box learning algorithms </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Harutyunyan%2C+H">Hrayr Harutyunyan</a>, <a href="/search/cs?searchtype=author&amp;query=Raginsky%2C+M">Maxim Raginsky</a>, <a href="/search/cs?searchtype=author&amp;query=Steeg%2C+G+V">Greg Ver Steeg</a>, <a href="/search/cs?searchtype=author&amp;query=Galstyan%2C+A">Aram Galstyan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2110.01584v2-abstract-short" style="display: inline;"> We derive information-theoretic generalization bounds for supervised learning algorithms based on the information contained in predictions rather than in the output of the training algorithm. These bounds improve over the existing information-theoretic bounds, are applicable to a wider range of algorithms, and solve two key challenges: (a) they give meaningful results for deterministic algorithms&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.01584v2-abstract-full').style.display = 'inline'; document.getElementById('2110.01584v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2110.01584v2-abstract-full" style="display: none;"> We derive information-theoretic generalization bounds for supervised learning algorithms based on the information contained in predictions rather than in the output of the training algorithm. These bounds improve over the existing information-theoretic bounds, are applicable to a wider range of algorithms, and solve two key challenges: (a) they give meaningful results for deterministic algorithms and (b) they are significantly easier to estimate. We show experimentally that the proposed bounds closely follow the generalization gap in practical scenarios for deep learning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.01584v2-abstract-full').style.display = 'none'; document.getElementById('2110.01584v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 October, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 October, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2021. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2021</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2109.03952">arXiv:2109.03952</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2109.03952">pdf</a>, <a href="https://arxiv.org/format/2109.03952">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Attributing Fair Decisions with Attention Interventions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Mehrabi%2C+N">Ninareh Mehrabi</a>, <a href="/search/cs?searchtype=author&amp;query=Gupta%2C+U">Umang Gupta</a>, <a href="/search/cs?searchtype=author&amp;query=Morstatter%2C+F">Fred Morstatter</a>, <a href="/search/cs?searchtype=author&amp;query=Steeg%2C+G+V">Greg Ver Steeg</a>, <a href="/search/cs?searchtype=author&amp;query=Galstyan%2C+A">Aram Galstyan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2109.03952v1-abstract-short" style="display: inline;"> The widespread use of Artificial Intelligence (AI) in consequential domains, such as healthcare and parole decision-making systems, has drawn intense scrutiny on the fairness of these methods. However, ensuring fairness is often insufficient as the rationale for a contentious decision needs to be audited, understood, and defended. We propose that the attention mechanism can be used to ensure fair&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2109.03952v1-abstract-full').style.display = 'inline'; document.getElementById('2109.03952v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2109.03952v1-abstract-full" style="display: none;"> The widespread use of Artificial Intelligence (AI) in consequential domains, such as healthcare and parole decision-making systems, has drawn intense scrutiny on the fairness of these methods. However, ensuring fairness is often insufficient as the rationale for a contentious decision needs to be audited, understood, and defended. We propose that the attention mechanism can be used to ensure fair outcomes while simultaneously providing feature attributions to account for how a decision was made. Toward this goal, we design an attention-based model that can be leveraged as an attribution framework. It can identify features responsible for both performance and fairness of the model through attention interventions and attention weight manipulation. Using this attribution framework, we then design a post-processing bias mitigation strategy and compare it with a suite of baselines. We demonstrate the versatility of our approach by conducting experiments on two distinct data types, tabular and textual. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2109.03952v1-abstract-full').style.display = 'none'; document.getElementById('2109.03952v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 September, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2021. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2108.03437">arXiv:2108.03437</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2108.03437">pdf</a>, <a href="https://arxiv.org/format/2108.03437">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Secure Neuroimaging Analysis using Federated Learning with Homomorphic Encryption </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Stripelis%2C+D">Dimitris Stripelis</a>, <a href="/search/cs?searchtype=author&amp;query=Saleem%2C+H">Hamza Saleem</a>, <a href="/search/cs?searchtype=author&amp;query=Ghai%2C+T">Tanmay Ghai</a>, <a href="/search/cs?searchtype=author&amp;query=Dhinagar%2C+N">Nikhil Dhinagar</a>, <a href="/search/cs?searchtype=author&amp;query=Gupta%2C+U">Umang Gupta</a>, <a href="/search/cs?searchtype=author&amp;query=Anastasiou%2C+C">Chrysovalantis Anastasiou</a>, <a href="/search/cs?searchtype=author&amp;query=Steeg%2C+G+V">Greg Ver Steeg</a>, <a href="/search/cs?searchtype=author&amp;query=Ravi%2C+S">Srivatsan Ravi</a>, <a href="/search/cs?searchtype=author&amp;query=Naveed%2C+M">Muhammad Naveed</a>, <a href="/search/cs?searchtype=author&amp;query=Thompson%2C+P+M">Paul M. Thompson</a>, <a href="/search/cs?searchtype=author&amp;query=Ambite%2C+J+L">Jose Luis Ambite</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2108.03437v2-abstract-short" style="display: inline;"> Federated learning (FL) enables distributed computation of machine learning models over various disparate, remote data sources, without requiring to transfer any individual data to a centralized location. This results in an improved generalizability of models and efficient scaling of computation as more sources and larger datasets are added to the federation. Nevertheless, recent membership attack&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2108.03437v2-abstract-full').style.display = 'inline'; document.getElementById('2108.03437v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2108.03437v2-abstract-full" style="display: none;"> Federated learning (FL) enables distributed computation of machine learning models over various disparate, remote data sources, without requiring to transfer any individual data to a centralized location. This results in an improved generalizability of models and efficient scaling of computation as more sources and larger datasets are added to the federation. 
Nevertheless, recent membership attacks show that private or sensitive personal data can sometimes be leaked or inferred when model parameters or summary statistics are shared with a central site, requiring improved security solutions. In this work, we propose a framework for secure FL using fully-homomorphic encryption (FHE). Specifically, we use the CKKS construction, an approximate, floating point compatible scheme that benefits from ciphertext packing and rescaling. In our evaluation on large-scale brain MRI datasets, we use our proposed secure FL framework to train a deep learning model to predict a person&#39;s age from distributed MRI scans, a common benchmarking task, and demonstrate that there is no degradation in the learning performance between the encrypted and non-encrypted federated models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2108.03437v2-abstract-full').style.display = 'none'; document.getElementById('2108.03437v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 November, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 August, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages, 3 figures, 1 algorithm</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2107.00745">arXiv:2107.00745</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2107.00745">pdf</a>, <a href="https://arxiv.org/format/2107.00745">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> q-Paths: Generalizing the Geometric Annealing Path using Power Means </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Masrani%2C+V">Vaden Masrani</a>, <a href="/search/cs?searchtype=author&amp;query=Brekelmans%2C+R">Rob Brekelmans</a>, <a href="/search/cs?searchtype=author&amp;query=Bui%2C+T">Thang Bui</a>, <a href="/search/cs?searchtype=author&amp;query=Nielsen%2C+F">Frank Nielsen</a>, <a href="/search/cs?searchtype=author&amp;query=Galstyan%2C+A">Aram Galstyan</a>, <a href="/search/cs?searchtype=author&amp;query=Steeg%2C+G+V">Greg Ver Steeg</a>, <a href="/search/cs?searchtype=author&amp;query=Wood%2C+F">Frank Wood</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2107.00745v1-abstract-short" style="display: inline;"> Many common machine learning methods involve the geometric annealing path, a sequence of intermediate densities between two distributions of interest constructed using the geometric average. 
While alternatives such as the moment-averaging path have demonstrated performance gains in some settings, their practical applicability remains limited by exponential family endpoint assumptions and a lack of&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2107.00745v1-abstract-full').style.display = 'inline'; document.getElementById('2107.00745v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2107.00745v1-abstract-full" style="display: none;"> Many common machine learning methods involve the geometric annealing path, a sequence of intermediate densities between two distributions of interest constructed using the geometric average. While alternatives such as the moment-averaging path have demonstrated performance gains in some settings, their practical applicability remains limited by exponential family endpoint assumptions and a lack of a closed-form energy function. In this work, we introduce $q$-paths, a family of paths which is derived from a generalized notion of the mean, includes the geometric and arithmetic mixtures as special cases, and admits a simple closed form involving the deformed logarithm function from nonextensive thermodynamics. Following previous analysis of the geometric path, we interpret our $q$-paths as corresponding to a $q$-exponential family of distributions, and provide a variational representation of intermediate densities as minimizing a mixture of $\alpha$-divergences to the endpoints. We show that small deviations away from the geometric path yield empirical gains for Bayesian inference using Sequential Monte Carlo and generative model evaluation using Annealed Importance Sampling. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2107.00745v1-abstract-full').style.display = 'none'; document.getElementById('2107.00745v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 July, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2021.
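<span class="has-text-grey-dark mathjax">For orientation, one way to write the closed form described in this abstract (an illustrative rendering based on the power-mean construction, with notation chosen here rather than quoted from the paper) is $$\tilde{\pi}^{(q)}_{\beta}(x) \;\propto\; \Big[(1-\beta)\,\tilde{\pi}_0(x)^{1-q} \;+\; \beta\,\tilde{\pi}_1(x)^{1-q}\Big]^{\tfrac{1}{1-q}},$$ which recovers the arithmetic mixture at $q=0$ and the geometric path $\tilde{\pi}_0(x)^{1-\beta}\,\tilde{\pi}_1(x)^{\beta}$ in the limit $q \to 1$; here $\tilde{\pi}_0$ and $\tilde{\pi}_1$ denote the unnormalized base and target densities and $\beta \in [0,1]$ is the mixing parameter.</span>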
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">arXiv admin note: text overlap with arXiv:2012.07823</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2105.02866">arXiv:2105.02866</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2105.02866">pdf</a>, <a href="https://arxiv.org/format/2105.02866">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Quantitative Methods">q-bio.QM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Membership Inference Attacks on Deep Regression Models for Neuroimaging </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gupta%2C+U">Umang Gupta</a>, <a href="/search/cs?searchtype=author&amp;query=Stripelis%2C+D">Dimitris Stripelis</a>, <a href="/search/cs?searchtype=author&amp;query=Lam%2C+P+K">Pradeep K. Lam</a>, <a href="/search/cs?searchtype=author&amp;query=Thompson%2C+P+M">Paul M. Thompson</a>, <a href="/search/cs?searchtype=author&amp;query=Ambite%2C+J+L">Jos茅 Luis Ambite</a>, <a href="/search/cs?searchtype=author&amp;query=Steeg%2C+G+V">Greg Ver Steeg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2105.02866v2-abstract-short" style="display: inline;"> Ensuring the privacy of research participants is vital, even more so in healthcare environments. Deep learning approaches to neuroimaging require large datasets, and this often necessitates sharing data between multiple sites, which is antithetical to the privacy objectives. Federated learning is a commonly proposed solution to this problem. It circumvents the need for data sharing by sharing para&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2105.02866v2-abstract-full').style.display = 'inline'; document.getElementById('2105.02866v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2105.02866v2-abstract-full" style="display: none;"> Ensuring the privacy of research participants is vital, even more so in healthcare environments. Deep learning approaches to neuroimaging require large datasets, and this often necessitates sharing data between multiple sites, which is antithetical to the privacy objectives. Federated learning is a commonly proposed solution to this problem. It circumvents the need for data sharing by sharing parameters during the training process. However, we demonstrate that allowing access to parameters may leak private information even if data is never directly shared. In particular, we show that it is possible to infer if a sample was used to train the model given only access to the model prediction (black-box) or access to the model itself (white-box) and some leaked samples from the training data distribution. Such attacks are commonly referred to as Membership Inference attacks. 
We show realistic Membership Inference attacks on deep learning models trained for 3D neuroimaging tasks in a centralized as well as decentralized setup. We demonstrate feasible attacks on brain age prediction models (deep learning models that predict a person&#39;s age from their brain MRI scan). We correctly identified whether an MRI scan was used in model training with a 60% to over 80% success rate depending on model complexity and security assumptions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2105.02866v2-abstract-full').style.display = 'none'; document.getElementById('2105.02866v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 June, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 May, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To appear at Medical Imaging with Deep Learning 2021 (MIDL 2021)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2102.08530">arXiv:2102.08530</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2102.08530">pdf</a>, <a href="https://arxiv.org/format/2102.08530">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Mathematical Software">cs.MS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> </div> </div> <p class="title is-5 mathjax"> Fast Graph Learning with Unique Optimal Solutions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Abu-El-Haija%2C+S">Sami Abu-El-Haija</a>, <a href="/search/cs?searchtype=author&amp;query=Crespi%2C+V">Valentino Crespi</a>, <a href="/search/cs?searchtype=author&amp;query=Steeg%2C+G+V">Greg Ver Steeg</a>, <a href="/search/cs?searchtype=author&amp;query=Galstyan%2C+A">Aram Galstyan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2102.08530v4-abstract-short" style="display: inline;"> We consider two popular Graph Representation Learning (GRL) methods: message passing for node classification and network embedding for link prediction. For each, we pick a popular model that we: (i) linearize and (ii) switch its training objective to Frobenius norm error minimization. These simplifications can cast the training into finding the optimal parameters in closed-form. We program in&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2102.08530v4-abstract-full').style.display = 'inline'; document.getElementById('2102.08530v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2102.08530v4-abstract-full" style="display: none;"> We consider two popular Graph Representation Learning (GRL) methods: message passing for node classification and network embedding for link prediction.
For each, we pick a popular model that we: (i) linearize and (ii) switch its training objective to Frobenius norm error minimization. These simplifications can cast the training into finding the optimal parameters in closed-form. We program in TensorFlow a functional form of Truncated Singular Value Decomposition (SVD), such that we could decompose a dense matrix $\mathbf{M}$ without explicitly computing $\mathbf{M}$. We achieve competitive performance on popular GRL tasks while providing orders of magnitude speedup. We open-source our code at http://github.com/samihaija/tf-fsvd <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2102.08530v4-abstract-full').style.display = 'none'; document.getElementById('2102.08530v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 April, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 February, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> ICLR 2021 Workshop on Geometrical and Topological Representation Learning </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2102.04438">arXiv:2102.04438</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2102.04438">pdf</a>, <a href="https://arxiv.org/format/2102.04438">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Quantitative Methods">q-bio.QM</span> </div> </div> <p class="title is-5 mathjax"> Improved Brain Age Estimation with Slice-based Set Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gupta%2C+U">Umang Gupta</a>, <a href="/search/cs?searchtype=author&amp;query=Lam%2C+P+K">Pradeep K. Lam</a>, <a href="/search/cs?searchtype=author&amp;query=Steeg%2C+G+V">Greg Ver Steeg</a>, <a href="/search/cs?searchtype=author&amp;query=Thompson%2C+P+M">Paul M. Thompson</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2102.04438v2-abstract-short" style="display: inline;"> Deep Learning for neuroimaging data is a promising but challenging direction. The high dimensionality of 3D MRI scans makes this endeavor compute and data-intensive. Most conventional 3D neuroimaging methods use 3D-CNN-based architectures with a large number of parameters and require more time and data to train.
Recently, 2D-slice-based models have received increasing attention as they have fewer&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2102.04438v2-abstract-full').style.display = 'inline'; document.getElementById('2102.04438v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2102.04438v2-abstract-full" style="display: none;"> Deep Learning for neuroimaging data is a promising but challenging direction. The high dimensionality of 3D MRI scans makes this endeavor compute and data-intensive. Most conventional 3D neuroimaging methods use 3D-CNN-based architectures with a large number of parameters and require more time and data to train. Recently, 2D-slice-based models have received increasing attention as they have fewer parameters and may require fewer samples to achieve comparable performance. In this paper, we propose a new architecture for BrainAGE prediction. The proposed architecture works by encoding each 2D slice in an MRI with a deep 2D-CNN model. Next, it combines the information from these 2D-slice encodings using set networks or permutation invariant layers. Experiments on the BrainAGE prediction problem, using the UK Biobank dataset, showed that the model with the permutation invariant layers trains faster and provides better predictions compared to other state-of-the-art approaches. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2102.04438v2-abstract-full').style.display = 'none'; document.getElementById('2102.04438v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 February, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 February, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To appear at IEEE International Symposium on Biomedical Imaging 2021 (ISBI 2021). 
Code is available at https://git.io/JtazG</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2102.04350">arXiv:2102.04350</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2102.04350">pdf</a>, <a href="https://arxiv.org/format/2102.04350">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Graph Traversal with Tensor Functionals: A Meta-Algorithm for Scalable Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Markowitz%2C+E">Elan Markowitz</a>, <a href="/search/cs?searchtype=author&amp;query=Balasubramanian%2C+K">Keshav Balasubramanian</a>, <a href="/search/cs?searchtype=author&amp;query=Mirtaheri%2C+M">Mehrnoosh Mirtaheri</a>, <a href="/search/cs?searchtype=author&amp;query=Abu-El-Haija%2C+S">Sami Abu-El-Haija</a>, <a href="/search/cs?searchtype=author&amp;query=Perozzi%2C+B">Bryan Perozzi</a>, <a href="/search/cs?searchtype=author&amp;query=Steeg%2C+G+V">Greg Ver Steeg</a>, <a href="/search/cs?searchtype=author&amp;query=Galstyan%2C+A">Aram Galstyan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2102.04350v1-abstract-short" style="display: inline;"> Graph Representation Learning (GRL) methods have impacted fields from chemistry to social science. However, their algorithmic implementations are specialized to specific use-cases, e.g. message passing methods are run differently from node embedding ones. Despite their apparent differences, all these methods utilize the graph structure, and therefore, their learning can be approximated with stochast&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2102.04350v1-abstract-full').style.display = 'inline'; document.getElementById('2102.04350v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2102.04350v1-abstract-full" style="display: none;"> Graph Representation Learning (GRL) methods have impacted fields from chemistry to social science. However, their algorithmic implementations are specialized to specific use-cases, e.g. message passing methods are run differently from node embedding ones. Despite their apparent differences, all these methods utilize the graph structure, and therefore, their learning can be approximated with stochastic graph traversals. We propose Graph Traversal via Tensor Functionals (GTTF), a unifying meta-algorithm framework for easing the implementation of diverse graph algorithms and enabling transparent and efficient scaling to large graphs. GTTF is founded upon a data structure (stored as a sparse tensor) and a stochastic graph traversal algorithm (described using tensor operations). The algorithm is a functional that accepts two functions, and can be specialized to obtain a variety of GRL models and objectives, simply by changing those two functions. We show that, for a wide class of methods, our algorithm learns in an unbiased fashion and, in expectation, approximates the learning as if the specialized implementations were run directly.
With these capabilities, we scale otherwise non-scalable methods to set state-of-the-art on large graph datasets while being more efficient than existing GRL libraries - with only a handful of lines of code for each method specialization. GTTF and its various GRL implementations are on: https://github.com/isi-usc-edu/gttf. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2102.04350v1-abstract-full').style.display = 'none'; document.getElementById('2102.04350v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 February, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To appear in ICLR 2021</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2101.04108">arXiv:2101.04108</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2101.04108">pdf</a>, <a href="https://arxiv.org/format/2101.04108">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Controllable Guarantees for Fair Outcomes via Contrastive Information Estimation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gupta%2C+U">Umang Gupta</a>, <a href="/search/cs?searchtype=author&amp;query=Ferber%2C+A+M">Aaron M Ferber</a>, <a href="/search/cs?searchtype=author&amp;query=Dilkina%2C+B">Bistra Dilkina</a>, <a href="/search/cs?searchtype=author&amp;query=Steeg%2C+G+V">Greg Ver Steeg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2101.04108v3-abstract-short" style="display: inline;"> Controlling bias in training datasets is vital for ensuring equal treatment, or parity, between different groups in downstream applications. A naive solution is to transform the data so that it is statistically independent of group membership, but this may throw away too much information when a reasonable compromise between fairness and accuracy is desired. Another common approach is to limit the&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2101.04108v3-abstract-full').style.display = 'inline'; document.getElementById('2101.04108v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2101.04108v3-abstract-full" style="display: none;"> Controlling bias in training datasets is vital for ensuring equal treatment, or parity, between different groups in downstream applications. A naive solution is to transform the data so that it is statistically independent of group membership, but this may throw away too much information when a reasonable compromise between fairness and accuracy is desired. Another common approach is to limit the ability of a particular adversary who seeks to maximize parity. 
Unfortunately, representations produced by adversarial approaches may still retain biases as their efficacy is tied to the complexity of the adversary used during training. To this end, we theoretically establish that by limiting the mutual information between representations and protected attributes, we can assuredly control the parity of any downstream classifier. We demonstrate an effective method for controlling parity through mutual information based on contrastive information estimators and show that they outperform approaches that rely on variational bounds based on complex generative models. We test our approach on UCI Adult and Heritage Health datasets and demonstrate that our approach provides more informative representations across a range of desired parity thresholds while providing strong theoretical guarantees on the parity of any downstream algorithm. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2101.04108v3-abstract-full').style.display = 'none'; document.getElementById('2101.04108v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 June, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 11 January, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This version fixes an error in Theorem 2 of the original manuscript that appeared at the Proceedings of the 35th AAAI Conference on Artificial Intelligence (AAAI-21). Code is available at https://github.com/umgupta/fairness-via-contrastive-estimation</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2012.15480">arXiv:2012.15480</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2012.15480">pdf</a>, <a href="https://arxiv.org/format/2012.15480">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Likelihood Ratio Exponential Families </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Brekelmans%2C+R">Rob Brekelmans</a>, <a href="/search/cs?searchtype=author&amp;query=Nielsen%2C+F">Frank Nielsen</a>, <a href="/search/cs?searchtype=author&amp;query=Makhzani%2C+A">Alireza Makhzani</a>, <a href="/search/cs?searchtype=author&amp;query=Galstyan%2C+A">Aram Galstyan</a>, <a href="/search/cs?searchtype=author&amp;query=Steeg%2C+G+V">Greg Ver Steeg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2012.15480v2-abstract-short" style="display: inline;"> The exponential family is well known in machine learning and statistical physics as the maximum entropy distribution subject to a set of observed constraints, while the geometric mixture path is common in MCMC methods such as annealed importance sampling. 
Linking these two ideas, recent work has interpreted the geometric mixture path as an exponential family of distributions to analyze the thermod&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2012.15480v2-abstract-full').style.display = 'inline'; document.getElementById('2012.15480v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2012.15480v2-abstract-full" style="display: none;"> The exponential family is well known in machine learning and statistical physics as the maximum entropy distribution subject to a set of observed constraints, while the geometric mixture path is common in MCMC methods such as annealed importance sampling. Linking these two ideas, recent work has interpreted the geometric mixture path as an exponential family of distributions to analyze the thermodynamic variational objective (TVO). We extend these likelihood ratio exponential families to include solutions to rate-distortion (RD) optimization, the information bottleneck (IB) method, and recent rate-distortion-classification approaches which combine RD and IB. This provides a common mathematical framework for understanding these methods via the conjugate duality of exponential families and hypothesis testing. Further, we collect existing results to provide a variational representation of intermediate RD or TVO distributions as minimizing an expectation of KL divergences. This solution also corresponds to a size-power tradeoff using the likelihood ratio test and the Neyman-Pearson lemma. In thermodynamic integration bounds such as the TVO, we identify the intermediate distribution whose expected sufficient statistics match the log partition function. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2012.15480v2-abstract-full').style.display = 'none'; document.getElementById('2012.15480v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 January, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 31 December, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2020.
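<span class="has-text-grey-dark mathjax">For orientation, the interpretation referenced in this abstract can be illustrated by a standard identity (notation chosen here rather than taken from the paper): the geometric mixture path $\pi_\beta(x) \propto \pi_0(x)^{1-\beta}\,\pi_1(x)^{\beta}$ can be rewritten as $\pi_\beta(x) \propto \pi_0(x)\,\exp\{\beta\, T(x)\}$ with $T(x) = \log\tfrac{\pi_1(x)}{\pi_0(x)}$, i.e. a one-parameter exponential family whose natural parameter is $\beta$ and whose sufficient statistic is the log likelihood ratio.</span>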
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS Workshop on Deep Learning through Information Geometry</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2012.07823">arXiv:2012.07823</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2012.07823">pdf</a>, <a href="https://arxiv.org/format/2012.07823">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Annealed Importance Sampling with q-Paths </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Brekelmans%2C+R">Rob Brekelmans</a>, <a href="/search/cs?searchtype=author&amp;query=Masrani%2C+V">Vaden Masrani</a>, <a href="/search/cs?searchtype=author&amp;query=Bui%2C+T">Thang Bui</a>, <a href="/search/cs?searchtype=author&amp;query=Wood%2C+F">Frank Wood</a>, <a href="/search/cs?searchtype=author&amp;query=Galstyan%2C+A">Aram Galstyan</a>, <a href="/search/cs?searchtype=author&amp;query=Steeg%2C+G+V">Greg Ver Steeg</a>, <a href="/search/cs?searchtype=author&amp;query=Nielsen%2C+F">Frank Nielsen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2012.07823v1-abstract-short" style="display: inline;"> Annealed importance sampling (AIS) is the gold standard for estimating partition functions or marginal likelihoods, corresponding to importance sampling over a path of distributions between a tractable base and an unnormalized target. While AIS yields an unbiased estimator for any path, existing literature has been primarily limited to the geometric mixture or moment-averaged paths associated with&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2012.07823v1-abstract-full').style.display = 'inline'; document.getElementById('2012.07823v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2012.07823v1-abstract-full" style="display: none;"> Annealed importance sampling (AIS) is the gold standard for estimating partition functions or marginal likelihoods, corresponding to importance sampling over a path of distributions between a tractable base and an unnormalized target. While AIS yields an unbiased estimator for any path, existing literature has been primarily limited to the geometric mixture or moment-averaged paths associated with the exponential family and KL divergence. We explore AIS using $q$-paths, which include the geometric path as a special case and are related to the homogeneous power mean, deformed exponential family, and $伪$-divergence. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2012.07823v1-abstract-full').style.display = 'none'; document.getElementById('2012.07823v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 December, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2020. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS Workshop on Deep Learning through Information Geometry (Best Paper Award)</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Published at UAI 2021 https://arxiv.org/abs/2107.00745 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2007.14917">arXiv:2007.14917</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2007.14917">pdf</a>, <a href="https://arxiv.org/format/2007.14917">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Compressing Deep Neural Networks via Layer Fusion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Neill%2C+J+O">James O&#39; Neill</a>, <a href="/search/cs?searchtype=author&amp;query=Steeg%2C+G+V">Greg Ver Steeg</a>, <a href="/search/cs?searchtype=author&amp;query=Galstyan%2C+A">Aram Galstyan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2007.14917v1-abstract-short" style="display: inline;"> This paper proposes \textit{layer fusion} - a model compression technique that discovers which weights to combine and then fuses weights of similar fully-connected, convolutional and attention layers. Layer fusion can significantly reduce the number of layers of the original network with little additional computation overhead, while maintaining competitive performance. From experiments on CIFAR-10&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2007.14917v1-abstract-full').style.display = 'inline'; document.getElementById('2007.14917v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2007.14917v1-abstract-full" style="display: none;"> This paper proposes \textit{layer fusion} - a model compression technique that discovers which weights to combine and then fuses weights of similar fully-connected, convolutional and attention layers. Layer fusion can significantly reduce the number of layers of the original network with little additional computation overhead, while maintaining competitive performance. From experiments on CIFAR-10, we find that various deep convolution neural networks can remain within 2\% accuracy points of the original networks up to a compression ratio of 3.33 when iteratively retrained with layer fusion. For experiments on the WikiText-2 language modelling dataset where pretrained transformer models are used, we achieve compression that leads to a network that is 20\% of its original size while being within 5 perplexity points of the original network. We also find that other well-established compression techniques can achieve competitive performance when compared to their original networks given a sufficient number of retraining steps. 
Generally, we observe a clear inflection point in performance as the amount of compression increases, suggesting a bound on the amount of compression that can be achieved before an exponential degradation in performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2007.14917v1-abstract-full').style.display = 'none'; document.getElementById('2007.14917v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 July, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2020. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2007.05335">arXiv:2007.05335</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2007.05335">pdf</a>, <a href="https://arxiv.org/format/2007.05335">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Robust Classification under Class-Dependent Domain Shift </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Galstyan%2C+T">Tigran Galstyan</a>, <a href="/search/cs?searchtype=author&amp;query=Khachatrian%2C+H">Hrant Khachatrian</a>, <a href="/search/cs?searchtype=author&amp;query=Steeg%2C+G+V">Greg Ver Steeg</a>, <a href="/search/cs?searchtype=author&amp;query=Galstyan%2C+A">Aram Galstyan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2007.05335v1-abstract-short" style="display: inline;"> Investigation of machine learning algorithms robust to changes between the training and test distributions is an active area of research. In this paper we explore a special type of dataset shift which we call class-dependent domain shift. It is characterized by the following features: the input data causally depends on the label, the shift in the data is fully explained by a known variable, the va&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2007.05335v1-abstract-full').style.display = 'inline'; document.getElementById('2007.05335v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2007.05335v1-abstract-full" style="display: none;"> Investigation of machine learning algorithms robust to changes between the training and test distributions is an active area of research. In this paper we explore a special type of dataset shift which we call class-dependent domain shift. It is characterized by the following features: the input data causally depends on the label, the shift in the data is fully explained by a known variable, the variable which controls the shift can depend on the label, there is no shift in the label distribution. We define a simple optimization problem with an information theoretic constraint and attempt to solve it with neural networks. Experiments on a toy dataset demonstrate the proposed method is able to learn robust classifiers which generalize well to unseen domains. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2007.05335v1-abstract-full').style.display = 'none'; document.getElementById('2007.05335v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 July, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at ICML 2020 workshop on Uncertainty and Robustness in Deep Learning</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2007.00642">arXiv:2007.00642</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2007.00642">pdf</a>, <a href="https://arxiv.org/format/2007.00642">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> All in the Exponential Family: Bregman Duality in Thermodynamic Variational Inference </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Brekelmans%2C+R">Rob Brekelmans</a>, <a href="/search/cs?searchtype=author&amp;query=Masrani%2C+V">Vaden Masrani</a>, <a href="/search/cs?searchtype=author&amp;query=Wood%2C+F">Frank Wood</a>, <a href="/search/cs?searchtype=author&amp;query=Steeg%2C+G+V">Greg Ver Steeg</a>, <a href="/search/cs?searchtype=author&amp;query=Galstyan%2C+A">Aram Galstyan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2007.00642v1-abstract-short" style="display: inline;"> The recently proposed Thermodynamic Variational Objective (TVO) leverages thermodynamic integration to provide a family of variational inference objectives, which both tighten and generalize the ubiquitous Evidence Lower Bound (ELBO). However, the tightness of TVO bounds was not previously known, an expensive grid search was used to choose a &#34;schedule&#34; of intermediate distributions, and model lear&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2007.00642v1-abstract-full').style.display = 'inline'; document.getElementById('2007.00642v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2007.00642v1-abstract-full" style="display: none;"> The recently proposed Thermodynamic Variational Objective (TVO) leverages thermodynamic integration to provide a family of variational inference objectives, which both tighten and generalize the ubiquitous Evidence Lower Bound (ELBO). However, the tightness of TVO bounds was not previously known, an expensive grid search was used to choose a &#34;schedule&#34; of intermediate distributions, and model learning suffered with ostensibly tighter bounds. In this work, we propose an exponential family interpretation of the geometric mixture curve underlying the TVO and various path sampling methods, which allows us to characterize the gap in TVO likelihood bounds as a sum of KL divergences. 
We propose to choose intermediate distributions using equal spacing in the moment parameters of our exponential family, which matches grid search performance and allows the schedule to adaptively update over the course of training. Finally, we derive a doubly reparameterized gradient estimator which improves model learning and allows the TVO to benefit from more refined bounds. To further contextualize our contributions, we provide a unified framework for understanding thermodynamic integration and the TVO using Taylor series remainders.
Submitted 1 July, 2020; originally announced July 2020.
Comments: ICML 2020
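
To make the moment-spacing idea concrete, here is a minimal NumPy sketch under assumed inputs: log_w holds per-sample log importance weights log p(x,z) - log q(z|x) for z drawn from q(z|x); the function name and grid size are illustrative and not taken from the paper's released code.

    import numpy as np

    def moment_spaced_betas(log_w, num_partitions, grid=1001):
        # eta(beta) = E_{pi_beta}[log w], where pi_beta is proportional to q^(1-beta) * p^beta,
        # estimated here by self-normalized importance sampling with samples from q.
        betas = np.linspace(0.0, 1.0, grid)
        lw = betas[:, None] * log_w[None, :]
        lw -= lw.max(axis=1, keepdims=True)
        w = np.exp(lw)
        w /= w.sum(axis=1, keepdims=True)
        eta = (w * log_w[None, :]).sum(axis=1)              # monotone in beta
        targets = np.linspace(eta[0], eta[-1], num_partitions + 1)
        idx = np.searchsorted(eta, targets)                 # betas whose moments are equally spaced
        return betas[np.clip(idx, 0, grid - 1)]

Re-estimating these betas from fresh importance weights every few epochs is one way such a schedule could adapt over the course of training, as the abstract describes.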

arXiv:2006.00115  [pdf, other]  q-bio.QM; cs.CV; cs.LG; eess.IV
Overview of Scanner Invariant Representations
Authors: Daniel Moyer, Greg Ver Steeg, Paul M. Thompson
Abstract: Pooled imaging data from multiple sources is subject to bias from each source. Studies that do not correct for these scanner/site biases at best lose statistical power, and at worst leave spurious correlations in their data. Estimation of the bias effects is non-trivial due to the paucity of data with correspondence across sites, so called "traveling phantom" data, which is expensive to collect. Nevertheless, numerous solutions leveraging direct correspondence have been proposed. In contrast to this, Moyer et al. (2019) proposes an unsupervised solution using invariant representations, one which does not require correspondence and thus does not require paired images. By leveraging the data processing inequality, an invariant representation can then be used to create an image reconstruction that is uninformative of its original source, yet still faithful to the underlying structure. In the present abstract we provide an overview of this method.
Submitted 29 May, 2020; originally announced June 2020.
Comments: Accepted as a short paper in MIDL 2020. In accordance with the MIDL 2020 Call for Papers, this short paper is an overview of an already published work arXiv:1904.05375, and was submitted to MIDL in order to allow presentation and discussion at the meeting
Report number: MIDL/2020/ExtendedAbstract/yqm9RD_XHT

arXiv:2005.02515  [pdf, other]  stat.ML; cs.LG  doi:10.1613/jair.1.13610
A Metric Space for Point Process Excitations
Authors: Myrl G. Marmarelis, Greg Ver Steeg, Aram Galstyan
Abstract: A multivariate Hawkes process enables self- and cross-excitations through a triggering matrix that behaves like an asymmetrical covariance structure, characterizing pairwise interactions between the event types. Full-rank estimation of all interactions is often infeasible in empirical settings. Models that specialize on a spatiotemporal application alleviate this obstacle by exploiting spatial locality, allowing the dyadic relationships between events to depend only on separation in time and relative distances in real Euclidean space. Here we generalize this framework to any multivariate Hawkes process, and harness it as a vessel for embedding arbitrary event types in a hidden metric space. Specifically, we propose a Hidden Hawkes Geometry (HHG) model to uncover the hidden geometry between event excitations in a multivariate point process. The low dimensionality of the embedding regularizes the structure of the inferred interactions. We develop a number of estimators and validate the model by conducting several experiments. In particular, we investigate regional infectivity dynamics of COVID-19 in an early South Korean record and recent Los Angeles confirmed cases. By additionally performing synthetic experiments on short records as well as explorations into options markets and the Ebola epidemic, we demonstrate that learning the embedding alongside a point process uncovers salient interactions in a broad range of applications.
Submitted 23 April, 2022; v1 submitted 5 May, 2020; originally announced May 2020.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Journal of Artificial Intelligence Research 73 (2022) 1323-1353 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2002.07933">arXiv:2002.07933</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2002.07933">pdf</a>, <a href="https://arxiv.org/format/2002.07933">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Improving Generalization by Controlling Label-Noise Information in Neural Network Weights </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Harutyunyan%2C+H">Hrayr Harutyunyan</a>, <a href="/search/cs?searchtype=author&amp;query=Reing%2C+K">Kyle Reing</a>, <a href="/search/cs?searchtype=author&amp;query=Steeg%2C+G+V">Greg Ver Steeg</a>, <a href="/search/cs?searchtype=author&amp;query=Galstyan%2C+A">Aram Galstyan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2002.07933v2-abstract-short" style="display: inline;"> In the presence of noisy or incorrect labels, neural networks have the undesirable tendency to memorize information about the noise. Standard regularization techniques such as dropout, weight decay or data augmentation sometimes help, but do not prevent this behavior. If one considers neural network weights as random variables that depend on the data and stochasticity of training, the amount of me&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2002.07933v2-abstract-full').style.display = 'inline'; document.getElementById('2002.07933v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2002.07933v2-abstract-full" style="display: none;"> In the presence of noisy or incorrect labels, neural networks have the undesirable tendency to memorize information about the noise. Standard regularization techniques such as dropout, weight decay or data augmentation sometimes help, but do not prevent this behavior. If one considers neural network weights as random variables that depend on the data and stochasticity of training, the amount of memorized information can be quantified with the Shannon mutual information between weights and the vector of all training labels given inputs, $I(w ; \mathbf{y} \mid \mathbf{x})$. We show that for any training algorithm, low values of this term correspond to reduction in memorization of label-noise and better generalization bounds. To obtain these low values, we propose training algorithms that employ an auxiliary network that predicts gradients in the final layers of a classifier without accessing labels. We illustrate the effectiveness of our approach on versions of MNIST, CIFAR-10, and CIFAR-100 corrupted with various noise models, and on a large-scale dataset Clothing1M that has noisy labels. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2002.07933v2-abstract-full').style.display = 'none'; document.getElementById('2002.07933v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 February, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICML, 2020</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1912.00646">arXiv:1912.00646</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1912.00646">pdf</a>, <a href="https://arxiv.org/format/1912.00646">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Discovery and Separation of Features for Invariant Representation Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Jaiswal%2C+A">Ayush Jaiswal</a>, <a href="/search/cs?searchtype=author&amp;query=Brekelmans%2C+R">Rob Brekelmans</a>, <a href="/search/cs?searchtype=author&amp;query=Moyer%2C+D">Daniel Moyer</a>, <a href="/search/cs?searchtype=author&amp;query=Steeg%2C+G+V">Greg Ver Steeg</a>, <a href="/search/cs?searchtype=author&amp;query=AbdAlmageed%2C+W">Wael AbdAlmageed</a>, <a href="/search/cs?searchtype=author&amp;query=Natarajan%2C+P">Premkumar Natarajan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1912.00646v1-abstract-short" style="display: inline;"> Supervised machine learning models often associate irrelevant nuisance factors with the prediction target, which hurts generalization. We propose a framework for training robust neural networks that induces invariance to nuisances through learning to discover and separate predictive and nuisance factors of data. We present an information theoretic formulation of our approach, from which we derive&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1912.00646v1-abstract-full').style.display = 'inline'; document.getElementById('1912.00646v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1912.00646v1-abstract-full" style="display: none;"> Supervised machine learning models often associate irrelevant nuisance factors with the prediction target, which hurts generalization. We propose a framework for training robust neural networks that induces invariance to nuisances through learning to discover and separate predictive and nuisance factors of data. We present an information theoretic formulation of our approach, from which we derive training objectives and its connections with previous methods. 

arXiv:1912.00646  [pdf, other]  cs.LG; stat.ML
Discovery and Separation of Features for Invariant Representation Learning
Authors: Ayush Jaiswal, Rob Brekelmans, Daniel Moyer, Greg Ver Steeg, Wael AbdAlmageed, Premkumar Natarajan
Abstract: Supervised machine learning models often associate irrelevant nuisance factors with the prediction target, which hurts generalization. We propose a framework for training robust neural networks that induces invariance to nuisances through learning to discover and separate predictive and nuisance factors of data. We present an information theoretic formulation of our approach, from which we derive training objectives and its connections with previous methods. Empirical results on a wide array of datasets show that the proposed framework achieves state-of-the-art performance, without requiring nuisance annotations during training.
Submitted 2 December, 2019; originally announced December 2019.
Comments: 10 pages, 3 figures
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1911.04060v2-abstract-full').style.display = 'none'; document.getElementById('1911.04060v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2019; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 November, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2019. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To appear in Proceedings of the 34th AAAI Conference on Artificial Intelligence (AAAI-20)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1909.03881">arXiv:1909.03881</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1909.03881">pdf</a>, <a href="https://arxiv.org/format/1909.03881">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Nearly-Unsupervised Hashcode Representations for Relation Extraction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Garg%2C+S">Sahil Garg</a>, <a href="/search/cs?searchtype=author&amp;query=Galstyan%2C+A">Aram Galstyan</a>, <a href="/search/cs?searchtype=author&amp;query=Steeg%2C+G+V">Greg Ver Steeg</a>, <a href="/search/cs?searchtype=author&amp;query=Cecchi%2C+G">Guillermo Cecchi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1909.03881v1-abstract-short" style="display: inline;"> Recently, kernelized locality sensitive hashcodes have been successfully employed as representations of natural language text, especially showing high relevance to biomedical relation extraction tasks. In this paper, we propose to optimize the hashcode representations in a nearly unsupervised manner, in which we only use data points, but not their class labels, for learning. The optimized hashcode&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1909.03881v1-abstract-full').style.display = 'inline'; document.getElementById('1909.03881v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1909.03881v1-abstract-full" style="display: none;"> Recently, kernelized locality sensitive hashcodes have been successfully employed as representations of natural language text, especially showing high relevance to biomedical relation extraction tasks. In this paper, we propose to optimize the hashcode representations in a nearly unsupervised manner, in which we only use data points, but not their class labels, for learning. 

arXiv:1909.03881  [pdf, other]  cs.LG; cs.AI; cs.CL; cs.IT; stat.ML
Nearly-Unsupervised Hashcode Representations for Relation Extraction
Authors: Sahil Garg, Aram Galstyan, Greg Ver Steeg, Guillermo Cecchi
Abstract: Recently, kernelized locality sensitive hashcodes have been successfully employed as representations of natural language text, especially showing high relevance to biomedical relation extraction tasks. In this paper, we propose to optimize the hashcode representations in a nearly unsupervised manner, in which we only use data points, but not their class labels, for learning. The optimized hashcode representations are then fed to a supervised classifier following the prior work. This nearly unsupervised approach allows fine-grained optimization of each hash function, which is particularly suitable for building hashcode representations generalizing from a training set to a test set. We empirically evaluate the proposed approach for biomedical relation extraction tasks, obtaining significant accuracy improvements w.r.t. state-of-the-art supervised and semi-supervised approaches.
Submitted 9 September, 2019; originally announced September 2019.
Comments: Proceedings of EMNLP-19

arXiv:1905.13276  [pdf, other]  cs.LG; stat.ML
Efficient Covariance Estimation from Temporal Data
Authors: Hrayr Harutyunyan, Daniel Moyer, Hrant Khachatrian, Greg Ver Steeg, Aram Galstyan
Abstract: Estimating the covariance structure of multivariate time series is a fundamental problem with a wide-range of real-world applications -- from financial modeling to fMRI analysis. Despite significant recent advances, current state-of-the-art methods are still severely limited in terms of scalability, and do not work well in high-dimensional undersampled regimes. In this work we propose a novel method called Temporal Correlation Explanation, or T-CorEx, that (a) has linear time and memory complexity with respect to the number of variables, and can scale to very large temporal datasets that are not tractable with existing methods; (b) gives state-of-the-art results in highly undersampled regimes on both synthetic and real-world datasets; and (c) makes minimal assumptions about the character of the dynamics of the system. T-CorEx optimizes an information-theoretic objective function to learn a latent factor graphical model for each time period and applies two regularization techniques to induce temporal consistency of estimates. We perform extensive evaluation of T-CorEx using both synthetic and real-world data and demonstrate that it can be used for detecting sudden changes in the underlying covariance matrix, capturing transient correlations and analyzing extremely high-dimensional complex multivariate time series such as high-resolution fMRI data.
Submitted 11 February, 2021; v1 submitted 30 May, 2019; originally announced May 2019.
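
The overall shape of such an estimator can be illustrated with a per-window latent factor model plus a penalty tying consecutive windows together. The sketch below substitutes an ordinary Gaussian factor-model likelihood for the paper's information-theoretic (CorEx-style) objective, so it shows the temporal-consistency idea rather than T-CorEx itself; the names and the single penalty term are assumptions.

    import numpy as np

    def temporal_factor_loss(W, psi, sample_covs, lam=1.0):
        # W: list of (p, k) factor loadings, one per time window;
        # psi: list of length-p noise variances; sample_covs: list of (p, p) sample covariances.
        loss = 0.0
        for t, C in enumerate(sample_covs):
            S = W[t] @ W[t].T + np.diag(psi[t])            # low-rank plus diagonal covariance estimate
            _, logdet = np.linalg.slogdet(S)
            loss += 0.5 * (logdet + np.trace(np.linalg.solve(S, C)))
            if t > 0:
                loss += lam * np.sum((W[t] - W[t - 1]) ** 2)   # temporal consistency penalty
        return loss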

arXiv:1905.00067  [pdf, other]  cs.LG; cs.SI; stat.ML
MixHop: Higher-Order Graph Convolutional Architectures via Sparsified Neighborhood Mixing
Authors: Sami Abu-El-Haija, Bryan Perozzi, Amol Kapoor, Nazanin Alipourfard, Kristina Lerman, Hrayr Harutyunyan, Greg Ver Steeg, Aram Galstyan
Abstract: Existing popular methods for semi-supervised learning with Graph Neural Networks (such as the Graph Convolutional Network) provably cannot learn a general class of neighborhood mixing relationships. To address this weakness, we propose a new model, MixHop, that can learn these relationships, including difference operators, by repeatedly mixing feature representations of neighbors at various distances. MixHop requires no additional memory or computational complexity, and outperforms challenging baselines. In addition, we propose sparsity regularization that allows us to visualize how the network prioritizes neighborhood information across different graph datasets. Our analysis of the learned architectures reveals that neighborhood mixing varies per dataset.
Submitted 19 June, 2019; v1 submitted 30 April, 2019; originally announced May 2019.
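
The core layer can be sketched in a few lines: powers of the normalized adjacency matrix propagate features to different hop distances, each power gets its own weight matrix, and the results are concatenated. This is a schematic NumPy version under assumed shapes, not the authors' implementation.

    import numpy as np

    def mixhop_layer(A_hat, H, weights, powers=(0, 1, 2)):
        # A_hat: (n, n) normalized adjacency; H: (n, d) node features;
        # weights[j]: (d, d_j) weight matrix for adjacency power j.
        outs, P = [], H
        for j in range(max(powers) + 1):
            if j in powers:
                outs.append(np.maximum(P @ weights[j], 0.0))   # ReLU(A_hat^j @ H @ W_j)
            P = A_hat @ P                                      # move one hop further out
        return np.concatenate(outs, axis=1)                    # concatenate along the feature axis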

arXiv:1904.07199  [pdf, other]  cs.LG; cs.IT; stat.ML
Exact Rate-Distortion in Autoencoders via Echo Noise
Authors: Rob Brekelmans, Daniel Moyer, Aram Galstyan, Greg Ver Steeg
Abstract: Compression is at the heart of effective representation learning. However, lossy compression is typically achieved through simple parametric models like Gaussian noise to preserve analytic tractability, and the limitations this imposes on learning are largely unexplored. Further, the Gaussian prior assumptions in models such as variational autoencoders (VAEs) provide only an upper bound on the compression rate in general. We introduce a new noise channel, Echo noise, that admits a simple, exact expression for mutual information for arbitrary input distributions. The noise is constructed in a data-driven fashion that does not require restrictive distributional assumptions. With its complex encoding mechanism and exact rate regularization, Echo leads to improved bounds on log-likelihood and dominates $\beta$-VAEs across the achievable range of rate-distortion trade-offs. Further, we show that Echo noise can outperform flow-based methods without the need to train additional distributional transformations.
Submitted 14 November, 2019; v1 submitted 15 April, 2019; originally announced April 2019.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2019; updated Gaussian baseline results, added disentanglement</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Steeg%2C+G+V&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Steeg%2C+G+V&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Steeg%2C+G+V&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 
