<!-- Removed pre-document scrape artifacts ("CINXE.COM" banner and a duplicated page title); the HTML document proper begins at the doctype below. -->
<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/> <!-- new favicon config and versions by realfavicongenerator.net --> <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b"> <!-- end favicon config --> <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\\(","\\)"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script 
src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a> <!-- contains Cornell logo and sponsor statement --> <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. 
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" 
role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 74 results for author: <span class="mathjax">Brooks, D</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Brooks%2C+D">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Brooks, D"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Brooks%2C+D&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option 
value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Brooks, D"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Brooks%2C+D&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Brooks%2C+D&amp;start=0" class="pagination-link is-current" aria-label="Go to page 1" aria-current="page">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Brooks%2C+D&amp;start=50" class="pagination-link" aria-label="Go to page 2">2 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.13858">arXiv:2405.13858</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2405.13858">pdf</a>, <a href="https://arxiv.org/format/2405.13858">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Emerging Technologies">cs.ET</span> <span class="tag is-small is-grey tooltip 
is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Carbon Connect: An Ecosystem for Sustainable Computing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lee%2C+B+C">Benjamin C. Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Brooks%2C+D">David Brooks</a>, <a href="/search/cs?searchtype=author&amp;query=van+Benthem%2C+A">Arthur van Benthem</a>, <a href="/search/cs?searchtype=author&amp;query=Gupta%2C+U">Udit Gupta</a>, <a href="/search/cs?searchtype=author&amp;query=Hills%2C+G">Gage Hills</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+V">Vincent Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Pierce%2C+B">Benjamin Pierce</a>, <a href="/search/cs?searchtype=author&amp;query=Stewart%2C+C">Christopher Stewart</a>, <a href="/search/cs?searchtype=author&amp;query=Strubell%2C+E">Emma Strubell</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+G">Gu-Yeon Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Wierman%2C+A">Adam Wierman</a>, <a href="/search/cs?searchtype=author&amp;query=Yao%2C+Y">Yuan Yao</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+M">Minlan Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.13858v2-abstract-short" style="display: inline;"> Computing is at a moment of profound opportunity. Emerging applications -- such as capable artificial intelligence, immersive virtual realities, and pervasive sensor systems -- drive unprecedented demand for computer. 
Despite recent advances toward net zero carbon emissions, the computing industry&#39;s gross energy usage continues to rise at an alarming rate, outpacing the growth of new energy instal&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.13858v2-abstract-full').style.display = 'inline'; document.getElementById('2405.13858v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.13858v2-abstract-full" style="display: none;"> Computing is at a moment of profound opportunity. Emerging applications -- such as capable artificial intelligence, immersive virtual realities, and pervasive sensor systems -- drive unprecedented demand for computer. Despite recent advances toward net zero carbon emissions, the computing industry&#39;s gross energy usage continues to rise at an alarming rate, outpacing the growth of new energy installations and renewable energy deployments. A shift towards sustainability is needed to spark a transformation in how computer systems are manufactured, allocated, and consumed. Carbon Connect envisions coordinated research thrusts that produce design and management strategies for sustainable, next-generation computer systems. These strategies must flatten and then reverse growth trajectories for computing power and carbon for society&#39;s most rapidly growing applications such as artificial intelligence and virtual spaces. We will require accurate models for carbon accounting in computing technology. For embodied carbon, we must re-think conventional design strategies -- over-provisioned monolithic servers, frequent hardware refresh cycles, custom silicon -- and adopt life-cycle design strategies that more effectively reduce, reuse and recycle hardware at scale. For operational carbon, we must not only embrace renewable energy but also design systems to use that energy more efficiently. 
Finally, new hardware design and management strategies must be cognizant of economic policy and regulatory landscape, aligning private initiatives with societal goals. Many of these broader goals will require computer scientists to develop deep, enduring collaborations with researchers in economics, law, and industrial ecology to spark change in broader practice. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.13858v2-abstract-full').style.display = 'none'; document.getElementById('2405.13858v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 22 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.02803">arXiv:2405.02803</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2405.02803">pdf</a>, <a href="https://arxiv.org/format/2405.02803">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> Is Flash Attention Stable? 
</p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Golden%2C+A">Alicia Golden</a>, <a href="/search/cs?searchtype=author&amp;query=Hsia%2C+S">Samuel Hsia</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+F">Fei Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Acun%2C+B">Bilge Acun</a>, <a href="/search/cs?searchtype=author&amp;query=Hosmer%2C+B">Basil Hosmer</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+Y">Yejin Lee</a>, <a href="/search/cs?searchtype=author&amp;query=DeVito%2C+Z">Zachary DeVito</a>, <a href="/search/cs?searchtype=author&amp;query=Johnson%2C+J">Jeff Johnson</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+G">Gu-Yeon Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Brooks%2C+D">David Brooks</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+C">Carole-Jean Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.02803v1-abstract-short" style="display: inline;"> Training large-scale machine learning models poses distinct system challenges, given both the size and complexity of today&#39;s workloads. Recently, many organizations training state-of-the-art Generative AI models have reported cases of instability during training, often taking the form of loss spikes. 
Numeric deviation has emerged as a potential cause of this training instability, although quantify&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.02803v1-abstract-full').style.display = 'inline'; document.getElementById('2405.02803v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.02803v1-abstract-full" style="display: none;"> Training large-scale machine learning models poses distinct system challenges, given both the size and complexity of today&#39;s workloads. Recently, many organizations training state-of-the-art Generative AI models have reported cases of instability during training, often taking the form of loss spikes. Numeric deviation has emerged as a potential cause of this training instability, although quantifying this is especially challenging given the costly nature of training runs. In this work, we develop a principled approach to understanding the effects of numeric deviation, and construct proxies to put observations into context when downstream effects are difficult to quantify. As a case study, we apply this framework to analyze the widely-adopted Flash Attention optimization. We find that Flash Attention sees roughly an order of magnitude more numeric deviation as compared to Baseline Attention at BF16 when measured during an isolated forward pass. We then use a data-driven analysis based on the Wasserstein Distance to provide upper bounds on how this numeric deviation impacts model weights during training, finding that the numerical deviation present in Flash Attention is 2-5 times less significant than low-precision training. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.02803v1-abstract-full').style.display = 'none'; document.getElementById('2405.02803v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.13513">arXiv:2402.13513</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2402.13513">pdf</a>, <a href="https://arxiv.org/format/2402.13513">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> </div> </div> <p class="title is-5 mathjax"> Guac: Energy-Aware and SSA-Based Generation of Coarse-Grained Merged Accelerators from LLVM-IR </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Brumar%2C+I">Iulian Brumar</a>, <a href="/search/cs?searchtype=author&amp;query=Rocha%2C+R">Rodrigo Rocha</a>, <a href="/search/cs?searchtype=author&amp;query=Bernat%2C+A">Alex Bernat</a>, <a href="/search/cs?searchtype=author&amp;query=Tripathy%2C+D">Devashree Tripathy</a>, <a href="/search/cs?searchtype=author&amp;query=Brooks%2C+D">David Brooks</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+G">Gu-Yeon Wei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.13513v1-abstract-short" style="display: inline;"> Designing accelerators for resource- and power-constrained applications is a daunting task. 
High-level Synthesis (HLS) addresses these constraints through resource sharing, an optimization at the HLS binding stage that maps multiple operations to the same functional unit. However, resource sharing is often limited to reusing instructions within a basic block. Instead of searching globally for th&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.13513v1-abstract-full').style.display = 'inline'; document.getElementById('2402.13513v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.13513v1-abstract-full" style="display: none;"> Designing accelerators for resource- and power-constrained applications is a daunting task. High-level Synthesis (HLS) addresses these constraints through resource sharing, an optimization at the HLS binding stage that maps multiple operations to the same functional unit. However, resource sharing is often limited to reusing instructions within a basic block. Instead of searching globally for the best control and dataflow graphs (CDFGs) to combine, it is constrained by existing instruction mappings and schedules. Coarse-grained function merging (CGFM) at the intermediate representation (IR) level can reuse control and dataflow patterns without dealing with the post-scheduling complexity of mapping operations onto functional units, wires, and registers. The merged functions produced by CGFM can be translated to RTL by HLS, yielding Coarse Grained Merged Accelerators (CGMAs). CGMAs are especially profitable across applications with similar data- and control-flow patterns. Prior work has used CGFM to generate CGMAs without regard for which CGFM algorithms best optimize area, power, and energy costs. We propose Guac, an energy-aware and SSA-based (static single assignment) CGMA generation methodology. Guac implements a novel ensemble of cost models for efficient CGMA generation. 
We also show that CGFM algorithms using SSA form to merge control- and dataflow graphs outperform prior non-SSA CGFM designs. We demonstrate significant area, power, and energy savings with respect to the state of the art. In particular, Guac more than doubles energy savings with respect to the closest related work while using a strong resource-sharing baseline. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.13513v1-abstract-full').style.display = 'none'; document.getElementById('2402.13513v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.05893">arXiv:2402.05893</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2402.05893">pdf</a>, <a href="https://arxiv.org/format/2402.05893">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> Personalizing Driver Safety Interfaces via Driver Cognitive Factors Inference </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Sumner%2C+E+S">Emily S Sumner</a>, <a href="/search/cs?searchtype=author&amp;query=DeCastro%2C+J">Jonathan DeCastro</a>, <a href="/search/cs?searchtype=author&amp;query=Costa%2C+J">Jean Costa</a>, <a href="/search/cs?searchtype=author&amp;query=Gopinath%2C+D+E">Deepak E Gopinath</a>, <a href="/search/cs?searchtype=author&amp;query=Kimani%2C+E">Everlyne Kimani</a>, <a href="/search/cs?searchtype=author&amp;query=Hakimi%2C+S">Shabnam Hakimi</a>, <a 
href="/search/cs?searchtype=author&amp;query=Morgan%2C+A">Allison Morgan</a>, <a href="/search/cs?searchtype=author&amp;query=Best%2C+A">Andrew Best</a>, <a href="/search/cs?searchtype=author&amp;query=Nguyen%2C+H">Hieu Nguyen</a>, <a href="/search/cs?searchtype=author&amp;query=Brooks%2C+D+J">Daniel J Brooks</a>, <a href="/search/cs?searchtype=author&amp;query=Haq%2C+B+u">Bassam ul Haq</a>, <a href="/search/cs?searchtype=author&amp;query=Patrikalakis%2C+A">Andrew Patrikalakis</a>, <a href="/search/cs?searchtype=author&amp;query=Yasuda%2C+H">Hiroshi Yasuda</a>, <a href="/search/cs?searchtype=author&amp;query=Sieck%2C+K">Kate Sieck</a>, <a href="/search/cs?searchtype=author&amp;query=Balachandran%2C+A">Avinash Balachandran</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+T">Tiffany Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Rosman%2C+G">Guy Rosman</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.05893v1-abstract-short" style="display: inline;"> Recent advances in AI and intelligent vehicle technology hold promise to revolutionize mobility and transportation, in the form of advanced driving assistance (ADAS) interfaces. 
Although it is widely recognized that certain cognitive factors, such as impulsivity and inhibitory control, are related to risky driving behavior, play a significant role in on-road risk-taking, existing systems fail to l&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.05893v1-abstract-full').style.display = 'inline'; document.getElementById('2402.05893v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.05893v1-abstract-full" style="display: none;"> Recent advances in AI and intelligent vehicle technology hold promise to revolutionize mobility and transportation, in the form of advanced driving assistance (ADAS) interfaces. Although it is widely recognized that certain cognitive factors, such as impulsivity and inhibitory control, are related to risky driving behavior, play a significant role in on-road risk-taking, existing systems fail to leverage such factors. Varying levels of these cognitive factors could influence the effectiveness and acceptance of driver safety interfaces. We demonstrate an approach for personalizing driver interaction via driver safety interfaces that are triggered based on a learned recurrent neural network. The network is trained from a population of human drivers to infer impulsivity and inhibitory control from recent driving behavior. Using a high-fidelity vehicle motion simulator, we demonstrate the ability to deduce these factors from driver behavior. We then use these inferred factors to make instantaneous determinations on whether or not to engage a driver safety interface. This interface aims to decrease a driver&#39;s speed during yellow lights and reduce their inclination to run through them. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.05893v1-abstract-full').style.display = 'none'; document.getElementById('2402.05893v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 7 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.16732">arXiv:2401.16732</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2401.16732">pdf</a>, <a href="https://arxiv.org/format/2401.16732">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Flash: A Hybrid Private Inference Protocol for Deep CNNs with High Accuracy and Low Latency on CPU </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Roh%2C+H">Hyeri Roh</a>, <a href="/search/cs?searchtype=author&amp;query=Yeo%2C+J">Jinsu Yeo</a>, <a href="/search/cs?searchtype=author&amp;query=Ko%2C+Y">Yeongil Ko</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+G">Gu-Yeon Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Brooks%2C+D">David Brooks</a>, <a href="/search/cs?searchtype=author&amp;query=Choi%2C+W">Woo-Seok Choi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.16732v1-abstract-short" style="display: inline;"> This paper 
presents Flash, an optimized private inference (PI) hybrid protocol utilizing both homomorphic encryption (HE) and secure two-party computation (2PC), which can reduce the end-to-end PI latency for deep CNN models less than 1 minute with CPU. To this end, first, Flash proposes a low-latency convolution algorithm built upon a fast slot rotation operation and a novel data encoding scheme,&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.16732v1-abstract-full').style.display = 'inline'; document.getElementById('2401.16732v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.16732v1-abstract-full" style="display: none;"> This paper presents Flash, an optimized private inference (PI) hybrid protocol utilizing both homomorphic encryption (HE) and secure two-party computation (2PC), which can reduce the end-to-end PI latency for deep CNN models less than 1 minute with CPU. To this end, first, Flash proposes a low-latency convolution algorithm built upon a fast slot rotation operation and a novel data encoding scheme, which results in 4-94x performance gain over the state-of-the-art. Second, to minimize the communication cost introduced by the standard nonlinear activation function ReLU, Flash replaces the entire ReLUs with the polynomial $x^2+x$ and trains deep CNN models with the new activation function. The trained models improve the inference accuracy for CIFAR-10/100 and TinyImageNet by 16% on average (up to 40% for ResNet-32) compared to prior art. Last, Flash proposes an efficient 2PC-based $x^2+x$ evaluation protocol that does not require any offline communication and that reduces the total communication cost to process the activation layer by 84-196x over the state-of-the-art. 
As a result, the end-to-end PI latency of Flash implemented on CPU is 0.02 minute for CIFAR-100 and 0.57 minute for TinyImageNet classification, while the total data communication is 0.07GB for CIFAR-100 and 0.22GB for TinyImageNet. Flash improves the state-of-the-art PI by 16-45x in latency and 84-196x in communication cost. Moreover, even for ImageNet, Flash can deliver the latency less than 1 minute on CPU with the total communication less than 1GB. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.16732v1-abstract-full').style.display = 'none'; document.getElementById('2401.16732v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.14385">arXiv:2312.14385</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2312.14385">pdf</a>, <a href="https://arxiv.org/format/2312.14385">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> Generative AI Beyond LLMs: System Implications of Multi-Modal Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Golden%2C+A">Alicia Golden</a>, <a href="/search/cs?searchtype=author&amp;query=Hsia%2C+S">Samuel Hsia</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+F">Fei 
Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Acun%2C+B">Bilge Acun</a>, <a href="/search/cs?searchtype=author&amp;query=Hosmer%2C+B">Basil Hosmer</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+Y">Yejin Lee</a>, <a href="/search/cs?searchtype=author&amp;query=DeVito%2C+Z">Zachary DeVito</a>, <a href="/search/cs?searchtype=author&amp;query=Johnson%2C+J">Jeff Johnson</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+G">Gu-Yeon Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Brooks%2C+D">David Brooks</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+C">Carole-Jean Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.14385v2-abstract-short" style="display: inline;"> As the development of large-scale Generative AI models evolve beyond text (1D) generation to include image (2D) and video (3D) generation, processing spatial and temporal information presents unique challenges to quality, performance, and efficiency. We present the first work towards understanding this new system design space for multi-modal text-to-image (TTI) and text-to-video (TTV) generation m&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.14385v2-abstract-full').style.display = 'inline'; document.getElementById('2312.14385v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.14385v2-abstract-full" style="display: none;"> As the development of large-scale Generative AI models evolve beyond text (1D) generation to include image (2D) and video (3D) generation, processing spatial and temporal information presents unique challenges to quality, performance, and efficiency. 
We present the first work towards understanding this new system design space for multi-modal text-to-image (TTI) and text-to-video (TTV) generation models. Current model architecture designs are bifurcated into 2 categories: Diffusion- and Transformer-based models. Our systematic performance characterization on a suite of eight representative TTI/TTV models shows that after state-of-the-art optimization techniques such as Flash Attention are applied, Convolution accounts for up to 44% of execution time for Diffusion-based TTI models, while Linear layers consume up to 49% of execution time for Transformer-based models. We additionally observe that Diffusion-based TTI models resemble the Prefill stage of LLM inference, and benefit from 1.1-2.5x greater speedup from Flash Attention than Transformer-based TTI models that resemble the Decode phase. Since optimizations designed for LLMs do not map directly onto TTI/TTV models, we must conduct a thorough characterization of these workloads to gain insights for new optimization opportunities. In doing so, we define sequence length in the context of TTI/TTV models and observe sequence length can vary up to 4x in Diffusion model inference. We additionally observe temporal aspects of TTV workloads pose unique system bottlenecks, with Temporal Attention accounting for over 60% of total Attention time. Overall, our in-depth system performance characterization is a critical first step towards designing efficient and deployable systems for emerging TTI/TTV workloads. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.14385v2-abstract-full').style.display = 'none'; document.getElementById('2312.14385v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published at 2024 IEEE International Symposium on Performance Analysis of Systems and Software (ISPASS)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.14062">arXiv:2311.14062</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2311.14062">pdf</a>, <a href="https://arxiv.org/format/2311.14062">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Hardware Resilience Properties of Text-Guided Image Classifiers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wasim%2C+S+T">Syed Talal Wasim</a>, <a href="/search/cs?searchtype=author&amp;query=Soboka%2C+K+H">Kabila Haile Soboka</a>, <a href="/search/cs?searchtype=author&amp;query=Mahmoud%2C+A">Abdulrahman Mahmoud</a>, <a href="/search/cs?searchtype=author&amp;query=Khan%2C+S">Salman Khan</a>, <a href="/search/cs?searchtype=author&amp;query=Brooks%2C+D">David Brooks</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+G">Gu-Yeon Wei</a> </p> <p class="abstract mathjax"> <span 
class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.14062v2-abstract-short" style="display: inline;"> This paper presents a novel method to enhance the reliability of image classification models during deployment in the face of transient hardware errors. By utilizing enriched text embeddings derived from GPT-3 with question prompts per class and CLIP pretrained text encoder, we investigate their impact as an initialization for the classification layer. Our approach achieves a remarkable&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.14062v2-abstract-full').style.display = 'inline'; document.getElementById('2311.14062v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.14062v2-abstract-full" style="display: none;"> This paper presents a novel method to enhance the reliability of image classification models during deployment in the face of transient hardware errors. By utilizing enriched text embeddings derived from GPT-3 with question prompts per class and CLIP pretrained text encoder, we investigate their impact as an initialization for the classification layer. Our approach achieves a remarkable $5.5\times$ average increase in hardware reliability (and up to $14\times$) across various architectures in the most critical layer, with minimal accuracy drop ($0.3\%$ on average) compared to baseline PyTorch models. Furthermore, our method seamlessly integrates with any image classification backbone, showcases results across various network architectures, decreases parameter and FLOPs overhead, and follows a consistent training recipe. This research offers a practical and efficient solution to bolster the robustness of image classification models against hardware failures, with potential implications for future studies in this domain. 
Our code and models are released at https://github.com/TalalWasim/TextGuidedResilience. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.14062v2-abstract-full').style.display = 'none'; document.getElementById('2311.14062v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at NeurIPS 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.08589">arXiv:2311.08589</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2311.08589">pdf</a>, <a href="https://arxiv.org/format/2311.08589">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> </div> </div> <p class="title is-5 mathjax"> Carbon Responder: Coordinating Demand Response for the Datacenter Fleet </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xing%2C+J">Jiali Xing</a>, <a href="/search/cs?searchtype=author&amp;query=Acun%2C+B">Bilge Acun</a>, <a href="/search/cs?searchtype=author&amp;query=Sundarrajan%2C+A">Aditya Sundarrajan</a>, <a href="/search/cs?searchtype=author&amp;query=Brooks%2C+D">David Brooks</a>, <a href="/search/cs?searchtype=author&amp;query=Chakkaravarthy%2C+M">Manoj 
Chakkaravarthy</a>, <a href="/search/cs?searchtype=author&amp;query=Avila%2C+N">Nikky Avila</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+C">Carole-Jean Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+B+C">Benjamin C. Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.08589v1-abstract-short" style="display: inline;"> The increasing integration of renewable energy sources results in fluctuations in carbon intensity throughout the day. To mitigate their carbon footprint, datacenters can implement demand response (DR) by adjusting their load based on grid signals. However, this presents challenges for private datacenters with diverse workloads and services. One of the key challenges is efficiently and fairly allo&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.08589v1-abstract-full').style.display = 'inline'; document.getElementById('2311.08589v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.08589v1-abstract-full" style="display: none;"> The increasing integration of renewable energy sources results in fluctuations in carbon intensity throughout the day. To mitigate their carbon footprint, datacenters can implement demand response (DR) by adjusting their load based on grid signals. However, this presents challenges for private datacenters with diverse workloads and services. One of the key challenges is efficiently and fairly allocating power curtailment across different workloads. In response to these challenges, we propose the Carbon Responder framework. The Carbon Responder framework aims to reduce the carbon footprint of heterogeneous workloads in datacenters by modulating their power usage. 
Unlike previous studies, Carbon Responder considers both online and batch workloads with different service level objectives and develops accurate performance models to achieve performance-aware power allocation. The framework supports three alternative policies: Efficient DR, Fair and Centralized DR, and Fair and Decentralized DR. We evaluate Carbon Responder policies using production workload traces from a private hyperscale datacenter. Our experimental results demonstrate that the efficient Carbon Responder policy reduces the carbon footprint by around 2x as much compared to baseline approaches adapted from existing methods. The fair Carbon Responder policies distribute the performance penalties and carbon reduction responsibility fairly among workloads. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.08589v1-abstract-full').style.display = 'none'; document.getElementById('2311.08589v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.02784">arXiv:2310.02784</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2310.02784">pdf</a>, <a href="https://arxiv.org/format/2310.02784">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> MAD Max Beyond Single-Node: Enabling Large Machine Learning Model Acceleration on Distributed Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hsia%2C+S">Samuel Hsia</a>, <a href="/search/cs?searchtype=author&amp;query=Golden%2C+A">Alicia Golden</a>, <a href="/search/cs?searchtype=author&amp;query=Acun%2C+B">Bilge Acun</a>, <a href="/search/cs?searchtype=author&amp;query=Ardalani%2C+N">Newsha Ardalani</a>, <a href="/search/cs?searchtype=author&amp;query=DeVito%2C+Z">Zachary DeVito</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+G">Gu-Yeon Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Brooks%2C+D">David Brooks</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+C">Carole-Jean Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.02784v3-abstract-short" style="display: inline;"> Training and deploying large-scale machine learning models is time-consuming, requires significant distributed computing infrastructures, and incurs high operational costs. 
Our analysis, grounded in real-world large model training on datacenter-scale infrastructures, reveals that 14~32% of all GPU hours are spent on communication with no overlapping computation. To minimize this outstanding commun&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.02784v3-abstract-full').style.display = 'inline'; document.getElementById('2310.02784v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.02784v3-abstract-full" style="display: none;"> Training and deploying large-scale machine learning models is time-consuming, requires significant distributed computing infrastructures, and incurs high operational costs. Our analysis, grounded in real-world large model training on datacenter-scale infrastructures, reveals that 14~32% of all GPU hours are spent on communication with no overlapping computation. To minimize this outstanding communication latency and other inherent at-scale inefficiencies, we introduce an agile performance modeling framework, MAD-Max. This framework is designed to optimize parallelization strategies and facilitate hardware-software co-design opportunities. Through the application of MAD-Max to a suite of real-world large-scale ML models on state-of-the-art GPU clusters, we showcase potential throughput enhancements of up to 2.24x for pre-training and up to 5.2x for inference scenarios, respectively. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.02784v3-abstract-full').style.display = 'none'; document.getElementById('2310.02784v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ISCA 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2309.14396">arXiv:2309.14396</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2309.14396">pdf</a>, <a href="https://arxiv.org/format/2309.14396">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Programming Languages">cs.PL</span> </div> </div> <p class="title is-5 mathjax"> Guess &amp; Sketch: Language Model Guided Transpilation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lee%2C+C">Celine Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Mahmoud%2C+A">Abdulrahman Mahmoud</a>, <a href="/search/cs?searchtype=author&amp;query=Kurek%2C+M">Michal Kurek</a>, <a href="/search/cs?searchtype=author&amp;query=Campanoni%2C+S">Simone Campanoni</a>, <a href="/search/cs?searchtype=author&amp;query=Brooks%2C+D">David Brooks</a>, <a 
href="/search/cs?searchtype=author&amp;query=Chong%2C+S">Stephen Chong</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+G">Gu-Yeon Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Rush%2C+A+M">Alexander M. Rush</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2309.14396v2-abstract-short" style="display: inline;"> Maintaining legacy software requires many software and systems engineering hours. Assembly code programs, which demand low-level control over the computer machine state and have no variable names, are particularly difficult for humans to analyze. Existing conventional program translators guarantee correctness, but are hand-engineered for the source and target programming languages in question. Lea&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.14396v2-abstract-full').style.display = 'inline'; document.getElementById('2309.14396v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2309.14396v2-abstract-full" style="display: none;"> Maintaining legacy software requires many software and systems engineering hours. Assembly code programs, which demand low-level control over the computer machine state and have no variable names, are particularly difficult for humans to analyze. Existing conventional program translators guarantee correctness, but are hand-engineered for the source and target programming languages in question. Learned transpilation, i.e. automatic translation of code, offers an alternative to manual re-writing and engineering efforts. Automated symbolic program translation approaches guarantee correctness but struggle to scale to longer programs due to the exponentially large search space. 
Their rigid rule-based systems also limit their expressivity, so they can only reason about a reduced space of programs. Probabilistic neural language models (LMs) produce plausible outputs for every input, but do so at the cost of guaranteed correctness. In this work, we leverage the strengths of LMs and symbolic solvers in a neurosymbolic approach to learned transpilation for assembly code. Assembly code is an appropriate setting for a neurosymbolic approach, since assembly code can be divided into shorter non-branching basic blocks amenable to the use of symbolic methods. Guess &amp; Sketch extracts alignment and confidence information from features of the LM then passes it to a symbolic solver to resolve semantic equivalence of the transpilation input and output. We test Guess &amp; Sketch on three different test sets of assembly transpilation tasks, varying in difficulty, and show that it successfully transpiles 57.6% more examples than GPT-4 and 39.6% more examples than an engineered transpiler. We also share a training and evaluation dataset for this task. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.14396v2-abstract-full').style.display = 'none'; document.getElementById('2309.14396v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 25 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2023. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2308.11992">arXiv:2308.11992</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2308.11992">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Tissues and Organs">q-bio.TO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Critical Evaluation of Artificial Intelligence as Digital Twin of Pathologist for Prostate Cancer Pathology </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Eminaga%2C+O">Okyaz Eminaga</a>, <a href="/search/cs?searchtype=author&amp;query=Abbas%2C+M">Mahmoud Abbas</a>, <a href="/search/cs?searchtype=author&amp;query=Kunder%2C+C">Christian Kunder</a>, <a href="/search/cs?searchtype=author&amp;query=Tolkach%2C+Y">Yuri Tolkach</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+R">Ryan Han</a>, <a href="/search/cs?searchtype=author&amp;query=Brooks%2C+J+D">James D. 
Brooks</a>, <a href="/search/cs?searchtype=author&amp;query=Nolley%2C+R">Rosalie Nolley</a>, <a href="/search/cs?searchtype=author&amp;query=Semjonow%2C+A">Axel Semjonow</a>, <a href="/search/cs?searchtype=author&amp;query=Boegemann%2C+M">Martin Boegemann</a>, <a href="/search/cs?searchtype=author&amp;query=West%2C+R">Robert West</a>, <a href="/search/cs?searchtype=author&amp;query=Long%2C+J">Jin Long</a>, <a href="/search/cs?searchtype=author&amp;query=Fan%2C+R">Richard Fan</a>, <a href="/search/cs?searchtype=author&amp;query=Bettendorf%2C+O">Olaf Bettendorf</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2308.11992v1-abstract-short" style="display: inline;"> Prostate cancer pathology plays a crucial role in clinical management but is time-consuming. Artificial intelligence (AI) shows promise in detecting prostate cancer and grading patterns. We tested an AI-based digital twin of a pathologist, vPatho, on 2,603 histology images of prostate tissue stained with hematoxylin and eosin. We analyzed various factors influencing tumor-grade disagreement betwee&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.11992v1-abstract-full').style.display = 'inline'; document.getElementById('2308.11992v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2308.11992v1-abstract-full" style="display: none;"> Prostate cancer pathology plays a crucial role in clinical management but is time-consuming. Artificial intelligence (AI) shows promise in detecting prostate cancer and grading patterns. We tested an AI-based digital twin of a pathologist, vPatho, on 2,603 histology images of prostate tissue stained with hematoxylin and eosin. We analyzed various factors influencing tumor-grade disagreement between vPatho and six human pathologists. 
Our results demonstrated that vPatho achieved comparable performance in prostate cancer detection and tumor volume estimation, as reported in the literature. Concordance levels between vPatho and human pathologists were examined. Notably, moderate to substantial agreement was observed in identifying complementary histological features such as ductal, cribriform, nerve, blood vessels, and lymph cell infiltrations. However, concordance in tumor grading showed a decline when applied to prostatectomy specimens (kappa = 0.44) compared to biopsy cores (kappa = 0.70). Adjusting the decision threshold for the secondary Gleason pattern from 5% to 10% improved the concordance level between pathologists and vPatho for tumor grading on prostatectomy specimens (kappa from 0.44 to 0.64). Potential causes of grade discordance included the vertical extent of tumors toward the prostate boundary and the proportions of slides with prostate cancer. Gleason pattern 4 was particularly associated with discordance. Notably, grade discordance with vPatho was not specific to any of the six pathologists involved in routine clinical grading. In conclusion, our study highlights the potential utility of AI in developing a digital twin of a pathologist. This approach can help uncover limitations in AI adoption and the current grading system for prostate cancer pathology. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.11992v1-abstract-full').style.display = 'none'; document.getElementById('2308.11992v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2023. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Under Review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2307.01753">arXiv:2307.01753</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2307.01753">pdf</a>, <a href="https://arxiv.org/format/2307.01753">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cosmology and Nongalactic Astrophysics">astro-ph.CO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computational Physics">physics.comp-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Data Analysis, Statistics and Probability">physics.data-an</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1093/mnras/stae886">10.1093/mnras/stae886 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Local primordial non-Gaussianity from the large-scale clustering of photometric DESI luminous red galaxies </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Rezaie%2C+M">Mehdi Rezaie</a>, <a href="/search/cs?searchtype=author&amp;query=Ross%2C+A+J">Ashley J. 
Ross</a>, <a href="/search/cs?searchtype=author&amp;query=Seo%2C+H">Hee-Jong Seo</a>, <a href="/search/cs?searchtype=author&amp;query=Kong%2C+H">Hui Kong</a>, <a href="/search/cs?searchtype=author&amp;query=Porredon%2C+A">Anna Porredon</a>, <a href="/search/cs?searchtype=author&amp;query=Samushia%2C+L">Lado Samushia</a>, <a href="/search/cs?searchtype=author&amp;query=Chaussidon%2C+E">Edmond Chaussidon</a>, <a href="/search/cs?searchtype=author&amp;query=Krolewski%2C+A">Alex Krolewski</a>, <a href="/search/cs?searchtype=author&amp;query=de+Mattia%2C+A">Arnaud de Mattia</a>, <a href="/search/cs?searchtype=author&amp;query=Beutler%2C+F">Florian Beutler</a>, <a href="/search/cs?searchtype=author&amp;query=Aguilar%2C+J+N">Jessica Nicole Aguilar</a>, <a href="/search/cs?searchtype=author&amp;query=Ahlen%2C+S">Steven Ahlen</a>, <a href="/search/cs?searchtype=author&amp;query=Alam%2C+S">Shadab Alam</a>, <a href="/search/cs?searchtype=author&amp;query=Avila%2C+S">Santiago Avila</a>, <a href="/search/cs?searchtype=author&amp;query=Bahr-Kalus%2C+B">Benedict Bahr-Kalus</a>, <a href="/search/cs?searchtype=author&amp;query=Bermejo-Climent%2C+J">Jose Bermejo-Climent</a>, <a href="/search/cs?searchtype=author&amp;query=Brooks%2C+D">David Brooks</a>, <a href="/search/cs?searchtype=author&amp;query=Claybaugh%2C+T">Todd Claybaugh</a>, <a href="/search/cs?searchtype=author&amp;query=Cole%2C+S">Shaun Cole</a>, <a href="/search/cs?searchtype=author&amp;query=Dawson%2C+K">Kyle Dawson</a>, <a href="/search/cs?searchtype=author&amp;query=de+la+Macorra%2C+A">Axel de la Macorra</a>, <a href="/search/cs?searchtype=author&amp;query=Doel%2C+P">Peter Doel</a>, <a href="/search/cs?searchtype=author&amp;query=Font-Ribera%2C+A">Andreu Font-Ribera</a>, <a href="/search/cs?searchtype=author&amp;query=Forero-Romero%2C+J+E">Jaime E. Forero-Romero</a>, <a href="/search/cs?searchtype=author&amp;query=Gontcho%2C+S+G+A">Satya Gontcho A Gontcho</a> , et al. 
(24 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2307.01753v3-abstract-short" style="display: inline;"> We use angular clustering of luminous red galaxies from the Dark Energy Spectroscopic Instrument (DESI) imaging surveys to constrain the local primordial non-Gaussianity parameter $\fnl$. Our sample comprises over 12 million targets, covering 14,000 square degrees of the sky, with redshifts in the range $0.2&lt; z &lt; 1.35$. We identify Galactic extinction, survey depth, and astronomical seeing as the&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.01753v3-abstract-full').style.display = 'inline'; document.getElementById('2307.01753v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2307.01753v3-abstract-full" style="display: none;"> We use angular clustering of luminous red galaxies from the Dark Energy Spectroscopic Instrument (DESI) imaging surveys to constrain the local primordial non-Gaussianity parameter $\fnl$. Our sample comprises over 12 million targets, covering 14,000 square degrees of the sky, with redshifts in the range $0.2&lt; z &lt; 1.35$. We identify Galactic extinction, survey depth, and astronomical seeing as the primary sources of systematic error, and employ linear regression and artificial neural networks to alleviate non-cosmological excess clustering on large scales. Our methods are tested against simulations with and without $\fnl$ and systematics, showing superior performance of the neural network treatment. The neural network with a set of nine imaging property maps passes our systematic null test criteria, and is chosen as the fiducial treatment. Assuming the universality relation, we find $\fnl = 34^{+24(+50)}_{-44(-73)}$ at 68\%(95\%) confidence. 
We apply a series of robustness tests (e.g., cuts on imaging, declination, or scales used) that show consistency in the obtained constraints. We study how the regression method biases the measured angular power-spectrum and degrades the $\fnl$ constraining power. The use of the nine maps more than doubles the uncertainty compared to using only the three primary maps in the regression. Our results thus motivate the development of more efficient methods that avoid over-correction, protect large-scale clustering information, and preserve constraining power. Additionally, our results encourage further studies of $\fnl$ with DESI spectroscopic samples, where the inclusion of 3D clustering modes should help separate imaging systematics and lessen the degradation in the $\fnl$ uncertainty. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.01753v3-abstract-full').style.display = 'none'; document.getElementById('2307.01753v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 July, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">21 pages, 17 figures, 7 tables (Appendix excluded). 
Published in MNRAS</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.08162">arXiv:2306.08162</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2306.08162">pdf</a>, <a href="https://arxiv.org/format/2306.08162">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> INT2.1: Towards Fine-Tunable Quantized Large Language Models with Error Correction through Low-Rank Adaptation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chai%2C+Y">Yuji Chai</a>, <a href="/search/cs?searchtype=author&amp;query=Gkountouras%2C+J">John Gkountouras</a>, <a href="/search/cs?searchtype=author&amp;query=Ko%2C+G+G">Glenn G. Ko</a>, <a href="/search/cs?searchtype=author&amp;query=Brooks%2C+D">David Brooks</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+G">Gu-Yeon Wei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.08162v1-abstract-short" style="display: inline;"> We introduce a method that dramatically reduces fine-tuning VRAM requirements and rectifies quantization errors in quantized Large Language Models. 
First, we develop an extremely memory-efficient fine-tuning (EMEF) method for quantized models using Low-Rank Adaptation (LoRA), and drawing upon it, we construct an error-correcting algorithm designed to minimize errors induced by the quantization pro&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.08162v1-abstract-full').style.display = 'inline'; document.getElementById('2306.08162v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.08162v1-abstract-full" style="display: none;"> We introduce a method that dramatically reduces fine-tuning VRAM requirements and rectifies quantization errors in quantized Large Language Models. First, we develop an extremely memory-efficient fine-tuning (EMEF) method for quantized models using Low-Rank Adaptation (LoRA), and drawing upon it, we construct an error-correcting algorithm designed to minimize errors induced by the quantization process. Our method reduces the memory requirements by up to 5.6 times, which enables fine-tuning a 7 billion parameter Large Language Model (LLM) on consumer laptops. At the same time, we propose a Low-Rank Error Correction (LREC) method that exploits the added LoRA layers to ameliorate the gap between the quantized model and its float point counterpart. Our error correction framework leads to a fully functional INT2 quantized LLM with the capacity to generate coherent English text. To the best of our knowledge, this is the first INT2 Large Language Model that has been able to reach such a performance. The overhead of our method is merely a 1.05 times increase in model size, which translates to an effective precision of INT2.1. Also, our method readily generalizes to other quantization standards, such as INT3, INT4, and INT8, restoring their lost performance, which marks a significant milestone in the field of model quantization. 
The strategies delineated in this paper hold promising implications for the future development and optimization of quantized models, marking a pivotal shift in the landscape of low-resource machine learning computations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.08162v1-abstract-full').style.display = 'none'; document.getElementById('2306.08162v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.06000">arXiv:2306.06000</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2306.06000">pdf</a>, <a href="https://arxiv.org/format/2306.06000">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> S$^{3}$: Increasing GPU Utilization during Generative Inference for Higher Throughput </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Jin%2C+Y">Yunho Jin</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+C">Chun-Feng Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Brooks%2C+D">David Brooks</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+G">Gu-Yeon Wei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.06000v1-abstract-short" style="display: inline;"> Generating texts with a large language 
model (LLM) consumes massive amounts of memory. Apart from the already-large model parameters, the key/value (KV) cache that holds information about previous tokens in a sequence can grow to be even larger than the model itself. This problem is exacerbated in one of the current LLM serving frameworks which reserves the maximum sequence length of memory for th&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.06000v1-abstract-full').style.display = 'inline'; document.getElementById('2306.06000v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.06000v1-abstract-full" style="display: none;"> Generating texts with a large language model (LLM) consumes massive amounts of memory. Apart from the already-large model parameters, the key/value (KV) cache that holds information about previous tokens in a sequence can grow to be even larger than the model itself. This problem is exacerbated in one of the current LLM serving frameworks which reserves the maximum sequence length of memory for the KV cache to guarantee generating a complete sequence as they do not know the output sequence length. This restricts us to use a smaller batch size leading to lower GPU utilization and above all, lower throughput. We argue that designing a system with a priori knowledge of the output sequence can mitigate this problem. To this end, we propose S$^{3}$, which predicts the output sequence length, schedules generation queries based on the prediction to increase device resource utilization and throughput, and handles mispredictions. Our proposed method achieves 6.49$\times$ throughput over those systems that assume the worst case for the output sequence length. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.06000v1-abstract-full').style.display = 'none'; document.getElementById('2306.06000v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.03148">arXiv:2305.03148</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2305.03148">pdf</a>, <a href="https://arxiv.org/format/2305.03148">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> </div> </div> <p class="title is-5 mathjax"> CAMEL: Co-Designing AI Models and Embedded DRAMs for Efficient On-Device Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+S+Q">Sai Qian Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Tambe%2C+T">Thierry Tambe</a>, <a href="/search/cs?searchtype=author&amp;query=Cuevas%2C+N">Nestor Cuevas</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+G">Gu-Yeon Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Brooks%2C+D">David Brooks</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.03148v3-abstract-short" style="display: inline;"> On-device learning allows AI models to adapt to user data, 
thereby enhancing service quality on edge platforms. However, training AI on resource-limited devices poses significant challenges due to the demanding computing workload and the substantial memory consumption and data access required by deep neural networks (DNNs). To address these issues, we propose utilizing embedded dynamic random-acce&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.03148v3-abstract-full').style.display = 'inline'; document.getElementById('2305.03148v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.03148v3-abstract-full" style="display: none;"> On-device learning allows AI models to adapt to user data, thereby enhancing service quality on edge platforms. However, training AI on resource-limited devices poses significant challenges due to the demanding computing workload and the substantial memory consumption and data access required by deep neural networks (DNNs). To address these issues, we propose utilizing embedded dynamic random-access memory (eDRAM) as the primary storage medium for transient training data. In comparison to static random-access memory (SRAM), eDRAM provides higher storage density and lower leakage power, resulting in reduced access cost and power leakage. Nevertheless, to maintain the integrity of the stored data, periodic power-hungry refresh operations could potentially degrade system performance. To minimize the occurrence of expensive eDRAM refresh operations, it is beneficial to shorten the lifetime of stored data during the training process. To achieve this, we adopt the principles of algorithm and hardware co-design, introducing a family of reversible DNN architectures that effectively decrease data lifetime and storage costs throughout training. 
Additionally, we present a highly efficient on-device training engine named \textit{CAMEL}, which leverages eDRAM as the primary on-chip memory. This engine enables efficient on-device training with significantly reduced memory usage and off-chip DRAM traffic while maintaining superior training accuracy. We evaluate our CAMEL system on multiple DNNs with different datasets, demonstrating a $2.5\times$ speedup of the training process and $2.8\times$ training energy savings than the other baseline hardware platforms. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.03148v3-abstract-full').style.display = 'none'; document.getElementById('2305.03148v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.01831">arXiv:2305.01831</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2305.01831">pdf</a>, <a href="https://arxiv.org/format/2305.01831">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> </div> </div> <p class="title is-5 mathjax"> Design Space Exploration and Optimization for Carbon-Efficient Extended Reality Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Elgamal%2C+M">Mariam Elgamal</a>, <a href="/search/cs?searchtype=author&amp;query=Carmean%2C+D">Doug Carmean</a>, <a href="/search/cs?searchtype=author&amp;query=Ansari%2C+E">Elnaz Ansari</a>, <a href="/search/cs?searchtype=author&amp;query=Zed%2C+O">Okay Zed</a>, <a href="/search/cs?searchtype=author&amp;query=Peri%2C+R">Ramesh Peri</a>, <a href="/search/cs?searchtype=author&amp;query=Manne%2C+S">Srilatha Manne</a>, <a href="/search/cs?searchtype=author&amp;query=Gupta%2C+U">Udit Gupta</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+G">Gu-Yeon Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Brooks%2C+D">David Brooks</a>, <a href="/search/cs?searchtype=author&amp;query=Hills%2C+G">Gage Hills</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+C">Carole-Jean Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.01831v1-abstract-short" style="display: inline;"> As computing hardware becomes more specialized, designing environmentally sustainable computing systems requires accounting for both hardware and software parameters. 
Our goal is to design low carbon computing systems while maintaining a competitive level of performance and operational efficiency. Despite previous carbon modeling efforts for computing systems, there is a distinct lack of holistic&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.01831v1-abstract-full').style.display = 'inline'; document.getElementById('2305.01831v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.01831v1-abstract-full" style="display: none;"> As computing hardware becomes more specialized, designing environmentally sustainable computing systems requires accounting for both hardware and software parameters. Our goal is to design low carbon computing systems while maintaining a competitive level of performance and operational efficiency. Despite previous carbon modeling efforts for computing systems, there is a distinct lack of holistic design strategies to simultaneously optimize for carbon, performance, power and energy. In this work, we take a data-driven approach to characterize the carbon impact (quantified in units of CO2e) of various artificial intelligence (AI) and extended reality (XR) production-level hardware and application use-cases. We propose a holistic design exploration framework to optimize and design for carbon-efficient computing systems and hardware. Our framework identifies significant opportunities for carbon efficiency improvements in application-specific and general purpose hardware design and optimization. 
Using our framework, we demonstrate 10$\times$ carbon efficiency improvement for specialized AI and XR accelerators (quantified by a key metric, tCDP: the product of total CO2e and total application execution time), up to 21% total life cycle carbon savings for existing general-purpose hardware and applications due to hardware over-provisioning, and up to 7.86$\times$ carbon efficiency improvement using advanced 3D integration techniques for resource-constrained XR systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.01831v1-abstract-full').style.display = 'none'; document.getElementById('2305.01831v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2304.00404">arXiv:2304.00404</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2304.00404">pdf</a>, <a href="https://arxiv.org/format/2304.00404">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> </div> </div> <p class="title is-5 mathjax"> GreenScale: Carbon-Aware Systems for Edge Computing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Kim%2C+Y+G">Young Geun Kim</a>, <a href="/search/cs?searchtype=author&amp;query=Gupta%2C+U">Udit Gupta</a>, <a href="/search/cs?searchtype=author&amp;query=McCrabb%2C+A">Andrew McCrabb</a>, <a href="/search/cs?searchtype=author&amp;query=Son%2C+Y">Yonglak Son</a>, <a 
href="/search/cs?searchtype=author&amp;query=Bertacco%2C+V">Valeria Bertacco</a>, <a href="/search/cs?searchtype=author&amp;query=Brooks%2C+D">David Brooks</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+C">Carole-Jean Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2304.00404v1-abstract-short" style="display: inline;"> To improve the environmental implications of the growing demand of computing, future applications need to improve the carbon-efficiency of computing infrastructures. State-of-the-art approaches, however, do not consider the intermittent nature of renewable energy. The time and location-based carbon intensity of energy fueling computing has been ignored when determining how computation is carried o&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.00404v1-abstract-full').style.display = 'inline'; document.getElementById('2304.00404v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2304.00404v1-abstract-full" style="display: none;"> To improve the environmental implications of the growing demand of computing, future applications need to improve the carbon-efficiency of computing infrastructures. State-of-the-art approaches, however, do not consider the intermittent nature of renewable energy. The time and location-based carbon intensity of energy fueling computing has been ignored when determining how computation is carried out. This poses a new challenge -- deciding when and where to run applications across consumer devices at the edge and servers in the cloud. Such scheduling decisions become more complicated with the stochastic runtime variance and the amortization of the rising embodied emissions. 
This work proposes GreenScale, a framework to understand the design and optimization space of carbon-aware scheduling for green applications across the edge-cloud infrastructure. Based on the quantified carbon output of the infrastructure components, we demonstrate that optimizing for carbon, compared to performance and energy efficiency, yields unique scheduling solutions. Our evaluation with three representative categories of applications (i.e., AI, Game, and AR/VR) demonstrates that the carbon emissions of the applications can be reduced by up to 29.1% with GreenScale. The analysis in this work further provides a detailed road map for edge-cloud application developers to build green applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.00404v1-abstract-full').style.display = 'none'; document.getElementById('2304.00404v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 April, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2023. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2302.10872">arXiv:2302.10872</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2302.10872">pdf</a>, <a href="https://arxiv.org/format/2302.10872">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> MP-Rec: Hardware-Software Co-Design to Enable Multi-Path Recommendation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hsia%2C+S">Samuel Hsia</a>, <a href="/search/cs?searchtype=author&amp;query=Gupta%2C+U">Udit Gupta</a>, <a href="/search/cs?searchtype=author&amp;query=Acun%2C+B">Bilge Acun</a>, <a href="/search/cs?searchtype=author&amp;query=Ardalani%2C+N">Newsha Ardalani</a>, <a href="/search/cs?searchtype=author&amp;query=Zhong%2C+P">Pan Zhong</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+G">Gu-Yeon Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Brooks%2C+D">David Brooks</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+C">Carole-Jean Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2302.10872v1-abstract-short" style="display: inline;"> Deep learning recommendation systems serve personalized content under diverse tail-latency targets and input-query loads. In order to do so, state-of-the-art recommendation models rely on terabyte-scale embedding tables to learn user preferences over large bodies of contents. 
The reliance on a fixed embedding representation of embedding tables not only imposes significant memory capacity and bandw&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2302.10872v1-abstract-full').style.display = 'inline'; document.getElementById('2302.10872v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2302.10872v1-abstract-full" style="display: none;"> Deep learning recommendation systems serve personalized content under diverse tail-latency targets and input-query loads. In order to do so, state-of-the-art recommendation models rely on terabyte-scale embedding tables to learn user preferences over large bodies of contents. The reliance on a fixed embedding representation of embedding tables not only imposes significant memory capacity and bandwidth requirements but also limits the scope of compatible system solutions. This paper challenges the assumption of fixed embedding representations by showing how synergies between embedding representations and hardware platforms can lead to improvements in both algorithmic- and system performance. Based on our characterization of various embedding representations, we propose a hybrid embedding representation that achieves higher quality embeddings at the cost of increased memory and compute requirements. To address the system performance challenges of the hybrid representation, we propose MP-Rec -- a co-design technique that exploits heterogeneity and dynamic selection of embedding representations and underlying hardware platforms. On real system hardware, we demonstrate how matching custom accelerators, i.e., GPUs, TPUs, and IPUs, with compatible embedding representations can lead to 16.65x performance speedup. 
Additionally, in query-serving scenarios, MP-Rec achieves 2.49x and 3.76x higher correct prediction throughput and 0.19% and 0.22% better model quality on a CPU-GPU system for the Kaggle and Terabyte datasets, respectively. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2302.10872v1-abstract-full').style.display = 'none'; document.getElementById('2302.10872v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 February, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> C.1; H.0 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2301.11273">arXiv:2301.11273</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2301.11273">pdf</a>, <a href="https://arxiv.org/format/2301.11273">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> AlignGraph: A Group of Generative Models for Graphs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Shayestehfard%2C+K">Kimia Shayestehfard</a>, <a href="/search/cs?searchtype=author&amp;query=Brooks%2C+D">Dana Brooks</a>, <a href="/search/cs?searchtype=author&amp;query=Ioannidis%2C+S">Stratis Ioannidis</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2301.11273v1-abstract-short" 
style="display: inline;"> It is challenging for generative models to learn a distribution over graphs because of the lack of permutation invariance: nodes may be ordered arbitrarily across graphs, and standard graph alignment is combinatorial and notoriously expensive. We propose AlignGraph, a group of generative models that combine fast and efficient graph alignment methods with a family of deep generative models that are&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2301.11273v1-abstract-full').style.display = 'inline'; document.getElementById('2301.11273v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2301.11273v1-abstract-full" style="display: none;"> It is challenging for generative models to learn a distribution over graphs because of the lack of permutation invariance: nodes may be ordered arbitrarily across graphs, and standard graph alignment is combinatorial and notoriously expensive. We propose AlignGraph, a group of generative models that combine fast and efficient graph alignment methods with a family of deep generative models that are invariant to node permutations. Our experiments demonstrate that our framework successfully learns graph distributions, outperforming competitors by 25%–560% in relevant performance scores. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2301.11273v1-abstract-full').style.display = 'none'; document.getElementById('2301.11273v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 January, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2023. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 2 figures, 4 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2301.10999">arXiv:2301.10999</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2301.10999">pdf</a>, <a href="https://arxiv.org/format/2301.10999">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Performance">cs.PF</span> </div> </div> <p class="title is-5 mathjax"> PerfSAGE: Generalized Inference Performance Predictor for Arbitrary Deep Learning Models on Edge Devices </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chai%2C+Y">Yuji Chai</a>, <a href="/search/cs?searchtype=author&amp;query=Tripathy%2C+D">Devashree Tripathy</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+C">Chuteng Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Gope%2C+D">Dibakar Gope</a>, <a href="/search/cs?searchtype=author&amp;query=Fedorov%2C+I">Igor Fedorov</a>, <a href="/search/cs?searchtype=author&amp;query=Matas%2C+R">Ramon Matas</a>, <a href="/search/cs?searchtype=author&amp;query=Brooks%2C+D">David Brooks</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+G">Gu-Yeon Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Whatmough%2C+P">Paul Whatmough</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2301.10999v1-abstract-short" style="display: inline;"> The ability to accurately predict deep neural network (DNN) inference performance metrics, such as latency, power, and 
memory footprint, for an arbitrary DNN on a target hardware platform is essential to the design of DNN based models. This ability is critical for the (manual or automatic) design, optimization, and deployment of practical DNNs for a specific hardware deployment platform. Unfortuna&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2301.10999v1-abstract-full').style.display = 'inline'; document.getElementById('2301.10999v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2301.10999v1-abstract-full" style="display: none;"> The ability to accurately predict deep neural network (DNN) inference performance metrics, such as latency, power, and memory footprint, for an arbitrary DNN on a target hardware platform is essential to the design of DNN based models. This ability is critical for the (manual or automatic) design, optimization, and deployment of practical DNNs for a specific hardware deployment platform. Unfortunately, these metrics are slow to evaluate using simulators (where available) and typically require measurement on the target hardware. This work describes PerfSAGE, a novel graph neural network (GNN) that predicts inference latency, energy, and memory footprint on an arbitrary DNN TFlite graph (TFL, 2017). In contrast, previously published performance predictors can only predict latency and are restricted to pre-defined construction rules or search spaces. This paper also describes the EdgeDLPerf dataset of 134,912 DNNs randomly sampled from four task search spaces and annotated with inference performance metrics from three edge hardware platforms. Using this dataset, we train PerfSAGE and provide experimental results that demonstrate state-of-the-art prediction accuracy with a Mean Absolute Percentage Error of &lt;5% across all targets and model search spaces. 
These results: (1) Outperform previous state-of-the-art GNN-based predictors (Dudziak et al., 2020), (2) Accurately predict performance on accelerators (a shortfall of non-GNN-based predictors (Zhang et al., 2021)), and (3) Demonstrate predictions on arbitrary input graphs without modifications to the feature extractor. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2301.10999v1-abstract-full').style.display = 'none'; document.getElementById('2301.10999v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 January, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2301.10904">arXiv:2301.10904</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2301.10904">pdf</a>, <a href="https://arxiv.org/format/2301.10904">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> GPU-based Private Information Retrieval for On-Device Machine Learning Inference </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lam%2C+M">Maximilian Lam</a>, <a href="/search/cs?searchtype=author&amp;query=Johnson%2C+J">Jeff Johnson</a>, <a href="/search/cs?searchtype=author&amp;query=Xiong%2C+W">Wenjie Xiong</a>, <a href="/search/cs?searchtype=author&amp;query=Maeng%2C+K">Kiwan Maeng</a>, <a 
href="/search/cs?searchtype=author&amp;query=Gupta%2C+U">Udit Gupta</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Lai%2C+L">Liangzhen Lai</a>, <a href="/search/cs?searchtype=author&amp;query=Leontiadis%2C+I">Ilias Leontiadis</a>, <a href="/search/cs?searchtype=author&amp;query=Rhu%2C+M">Minsoo Rhu</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+H+S">Hsien-Hsin S. Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Reddi%2C+V+J">Vijay Janapa Reddi</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+G">Gu-Yeon Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Brooks%2C+D">David Brooks</a>, <a href="/search/cs?searchtype=author&amp;query=Suh%2C+G+E">G. Edward Suh</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2301.10904v3-abstract-short" style="display: inline;"> On-device machine learning (ML) inference can enable the use of private user data on user devices without revealing them to remote servers. However, a pure on-device solution to private ML inference is impractical for many applications that rely on embedding tables that are too large to be stored on-device. In particular, recommendation models typically use multiple embedding tables each on the or&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2301.10904v3-abstract-full').style.display = 'inline'; document.getElementById('2301.10904v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2301.10904v3-abstract-full" style="display: none;"> On-device machine learning (ML) inference can enable the use of private user data on user devices without revealing them to remote servers. 
However, a pure on-device solution to private ML inference is impractical for many applications that rely on embedding tables that are too large to be stored on-device. In particular, recommendation models typically use multiple embedding tables each on the order of 1-10 GBs of data, making them impractical to store on-device. To overcome this barrier, we propose the use of private information retrieval (PIR) to efficiently and privately retrieve embeddings from servers without sharing any private information. As off-the-shelf PIR algorithms are usually too computationally intensive to directly use for latency-sensitive inference tasks, we 1) propose novel GPU-based acceleration of PIR, and 2) co-design PIR with the downstream ML application to obtain further speedup. Our GPU acceleration strategy improves system throughput by more than $20 \times$ over an optimized CPU PIR implementation, and our PIR-ML co-design provides an over $5 \times$ additional throughput improvement at fixed model quality. Together, for various on-device ML applications such as recommendation and language modeling, our system on a single V100 GPU can serve up to $100,000$ queries per second -- a $&gt;100 \times$ throughput improvement over a CPU-based baseline -- while maintaining model accuracy. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2301.10904v3-abstract-full').style.display = 'none'; document.getElementById('2301.10904v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 25 January, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2023. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2212.00827">arXiv:2212.00827</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2212.00827">pdf</a>, <a href="https://arxiv.org/format/2212.00827">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Performance">cs.PF</span> </div> </div> <p class="title is-5 mathjax"> Architectural Implications of Embedding Dimension during GCN on CPU and GPU </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Adiletta%2C+M">Matthew Adiletta</a>, <a href="/search/cs?searchtype=author&amp;query=Brooks%2C+D">David Brooks</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+G">Gu-Yeon Wei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2212.00827v1-abstract-short" style="display: inline;"> Graph Neural Networks (GNNs) are a class of neural networks designed to extract information from the graphical structure of data. Graph Convolutional Networks (GCNs) are a widely used type of GNN for transductive graph learning problems which apply convolution to learn information from graphs. 
GCN is a challenging algorithm from an architecture perspective due to inherent sparsity, low data reuse,&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.00827v1-abstract-full').style.display = 'inline'; document.getElementById('2212.00827v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2212.00827v1-abstract-full" style="display: none;"> Graph Neural Networks (GNNs) are a class of neural networks designed to extract information from the graphical structure of data. Graph Convolutional Networks (GCNs) are a widely used type of GNN for transductive graph learning problems which apply convolution to learn information from graphs. GCN is a challenging algorithm from an architecture perspective due to inherent sparsity, low data reuse, and massive memory capacity requirements. Traditional neural algorithms exploit the high compute capacity of GPUs to achieve high performance for both inference and training. The architectural decision to use a GPU for GCN inference is a question explored in this work. GCN on both CPU and GPU was characterized in order to better understand the implications of graph size, embedding dimension, and sampling on performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.00827v1-abstract-full').style.display = 'none'; document.getElementById('2212.00827v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 December, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2022. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 7 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2209.14657">arXiv:2209.14657</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2209.14657">pdf</a>, <a href="https://arxiv.org/format/2209.14657">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Correlated Feature Aggregation by Region Helps Distinguish Aggressive from Indolent Clear Cell Renal Cell Carcinoma Subtypes on CT </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Stacke%2C+K">Karin Stacke</a>, <a href="/search/cs?searchtype=author&amp;query=Bhattacharya%2C+I">Indrani Bhattacharya</a>, <a href="/search/cs?searchtype=author&amp;query=Tse%2C+J+R">Justin R. Tse</a>, <a href="/search/cs?searchtype=author&amp;query=Brooks%2C+J+D">James D. Brooks</a>, <a href="/search/cs?searchtype=author&amp;query=Sonn%2C+G+A">Geoffrey A. Sonn</a>, <a href="/search/cs?searchtype=author&amp;query=Rusu%2C+M">Mirabela Rusu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2209.14657v1-abstract-short" style="display: inline;"> Renal cell carcinoma (RCC) is a common cancer that varies in clinical behavior. Indolent RCC is often low-grade without necrosis and can be monitored without treatment. Aggressive RCC is often high-grade and can cause metastasis and death if not promptly detected and treated. 
While most kidney cancers are detected on CT scans, grading is based on histology from invasive biopsy or surgery. Determin&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2209.14657v1-abstract-full').style.display = 'inline'; document.getElementById('2209.14657v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2209.14657v1-abstract-full" style="display: none;"> Renal cell carcinoma (RCC) is a common cancer that varies in clinical behavior. Indolent RCC is often low-grade without necrosis and can be monitored without treatment. Aggressive RCC is often high-grade and can cause metastasis and death if not promptly detected and treated. While most kidney cancers are detected on CT scans, grading is based on histology from invasive biopsy or surgery. Determining aggressiveness on CT images is clinically important as it facilitates risk stratification and treatment planning. This study aims to use machine learning methods to identify radiology features that correlate with features on pathology to facilitate assessment of cancer aggressiveness on CT images instead of histology. This paper presents a novel automated method, Correlated Feature Aggregation By Region (CorrFABR), for classifying aggressiveness of clear cell RCC by leveraging correlations between radiology and corresponding unaligned pathology images. CorrFABR consists of three main steps: (1) Feature Aggregation where region-level features are extracted from radiology and pathology images, (2) Fusion where radiology features correlated with pathology features are learned on a region level, and (3) Prediction where the learned correlated features are used to distinguish aggressive from indolent clear cell RCC using CT alone as input. 
Thus, during training, CorrFABR learns from both radiology and pathology images, but during inference, CorrFABR will distinguish aggressive from indolent clear cell RCC using CT alone, in the absence of pathology images. CorrFABR improved classification performance over radiology features alone, with an increase in binary classification F1-score from 0.68 (0.04) to 0.73 (0.03). This demonstrates the potential of incorporating pathology disease characteristics for improved classification of aggressiveness of clear cell RCC on CT images. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2209.14657v1-abstract-full').style.display = 'none'; document.getElementById('2209.14657v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 September, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2022. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to Medical Image Analysis</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2209.12127">arXiv:2209.12127</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2209.12127">pdf</a>, <a href="https://arxiv.org/format/2209.12127">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> SpeedLimit: Neural Architecture Search for Quantized Transformer Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chai%2C+Y">Yuji Chai</a>, <a href="/search/cs?searchtype=author&amp;query=Bailey%2C+L">Luke Bailey</a>, <a href="/search/cs?searchtype=author&amp;query=Jin%2C+Y">Yunho Jin</a>, <a href="/search/cs?searchtype=author&amp;query=Karle%2C+M">Matthew Karle</a>, <a href="/search/cs?searchtype=author&amp;query=Ko%2C+G+G">Glenn G. Ko</a>, <a href="/search/cs?searchtype=author&amp;query=Brooks%2C+D">David Brooks</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+G">Gu-Yeon Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Kung%2C+H+T">H. T. Kung</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2209.12127v3-abstract-short" style="display: inline;"> While research in the field of transformer models has primarily focused on enhancing performance metrics such as accuracy and perplexity, practical applications in industry often necessitate a rigorous consideration of inference latency constraints. 
Addressing this challenge, we introduce SpeedLimit, a novel Neural Architecture Search (NAS) technique that optimizes accuracy whilst adhering to an u&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2209.12127v3-abstract-full').style.display = 'inline'; document.getElementById('2209.12127v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2209.12127v3-abstract-full" style="display: none;"> While research in the field of transformer models has primarily focused on enhancing performance metrics such as accuracy and perplexity, practical applications in industry often necessitate a rigorous consideration of inference latency constraints. Addressing this challenge, we introduce SpeedLimit, a novel Neural Architecture Search (NAS) technique that optimizes accuracy whilst adhering to an upper-bound latency constraint. Our method incorporates 8-bit integer quantization in the search process to outperform the current state-of-the-art technique. Our results underline the feasibility and efficacy of seeking an optimal balance between performance and latency, providing new avenues for deploying state-of-the-art transformer models in latency-sensitive environments. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2209.12127v3-abstract-full').style.display = 'none'; document.getElementById('2209.12127v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 September, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2022. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2205.06437">arXiv:2205.06437</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2205.06437">pdf</a>, <a href="https://arxiv.org/format/2205.06437">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Impala: Low-Latency, Communication-Efficient Private Deep Learning Inference </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Choi%2C+W">Woo-Seok Choi</a>, <a href="/search/cs?searchtype=author&amp;query=Reagen%2C+B">Brandon Reagen</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+G">Gu-Yeon Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Brooks%2C+D">David Brooks</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2205.06437v1-abstract-short" style="display: inline;"> This paper proposes Impala, a new cryptographic protocol for private inference in the client-cloud setting. Impala builds upon recent solutions that combine the complementary strengths of homomorphic encryption (HE) and secure multi-party computation (MPC). A series of protocol optimizations are developed to reduce both communication and performance bottlenecks. 
First, we remove MPC&#39;s overwhelming&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2205.06437v1-abstract-full').style.display = 'inline'; document.getElementById('2205.06437v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2205.06437v1-abstract-full" style="display: none;"> This paper proposes Impala, a new cryptographic protocol for private inference in the client-cloud setting. Impala builds upon recent solutions that combine the complementary strengths of homomorphic encryption (HE) and secure multi-party computation (MPC). A series of protocol optimizations are developed to reduce both communication and performance bottlenecks. First, we remove MPC&#39;s overwhelmingly high communication cost from the client by introducing a proxy server and developing a low-overhead key switching technique. Key switching reduces the client&#39;s bandwidth by multiple orders of magnitude, however the communication between the proxy and cloud is still excessive. Second, we develop an optimized garbled circuit that leverages truncated secret shares for faster evaluation and less proxy-cloud communication. Finally, we propose sparse HE convolution to reduce the computational bottleneck of using HE. Compared to the state-of-the-art, these optimizations provide a bandwidth savings of over 3X and speedup of 4X for private deep learning inference. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2205.06437v1-abstract-full').style.display = 'none'; document.getElementById('2205.06437v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 May, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2022. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2205.03325">arXiv:2205.03325</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2205.03325">pdf</a>, <a href="https://arxiv.org/format/2205.03325">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> OMU: A Probabilistic 3D Occupancy Mapping Accelerator for Real-time OctoMap at the Edge </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Jia%2C+T">Tianyu Jia</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+E">En-Yu Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Hsiao%2C+Y">Yu-Shun Hsiao</a>, <a href="/search/cs?searchtype=author&amp;query=Cruz%2C+J">Jonathan Cruz</a>, <a href="/search/cs?searchtype=author&amp;query=Brooks%2C+D">David Brooks</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+G">Gu-Yeon Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Reddi%2C+V+J">Vijay Janapa Reddi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2205.03325v1-abstract-short" style="display: inline;"> Autonomous machines (e.g., vehicles, mobile robots, drones) require sophisticated 3D mapping to perceive the dynamic environment. However, maintaining a real-time 3D map is expensive both in terms of compute and memory requirements, especially for resource-constrained edge machines. 
Probabilistic OctoMap is a reliable and memory-efficient 3D dense map model to represent the full environment, with&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2205.03325v1-abstract-full').style.display = 'inline'; document.getElementById('2205.03325v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2205.03325v1-abstract-full" style="display: none;"> Autonomous machines (e.g., vehicles, mobile robots, drones) require sophisticated 3D mapping to perceive the dynamic environment. However, maintaining a real-time 3D map is expensive both in terms of compute and memory requirements, especially for resource-constrained edge machines. Probabilistic OctoMap is a reliable and memory-efficient 3D dense map model to represent the full environment, with dynamic voxel node pruning and expansion capacity. This paper presents the first efficient accelerator solution, i.e. OMU, to enable real-time probabilistic 3D mapping at the edge. To improve the performance, the input map voxels are updated via parallel PE units for data parallelism. Within each PE, the voxels are stored using a specially developed data structure in parallel memory banks. In addition, a pruning address manager is designed within each PE unit to reuse the pruned memory addresses. The proposed 3D mapping accelerator is implemented and evaluated using a commercial 12 nm technology. Compared to the ARM Cortex-A57 CPU in the Nvidia Jetson TX2 platform, the proposed accelerator achieves up to 62$\times$ performance and 708$\times$ energy efficiency improvement. Furthermore, the accelerator provides 63 FPS throughput, more than 2$\times$ higher than a real-time requirement, enabling real-time perception for 3D mapping. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2205.03325v1-abstract-full').style.display = 'none'; document.getElementById('2205.03325v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 May, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">2022 Design Automation and Test in Europe Conference (DATE), March 14-23, 2022, Virtual</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2203.06732">arXiv:2203.06732</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2203.06732">pdf</a>, <a href="https://arxiv.org/format/2203.06732">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Quantitative Methods">q-bio.QM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computational Engineering, Finance, and Science">cs.CE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Molecular Networks">q-bio.MN</span> </div> </div> <p class="title is-5 mathjax"> BioSimulators: a central registry of simulation engines and services for recommending specific tools </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Shaikh%2C+B">Bilal Shaikh</a>, <a href="/search/cs?searchtype=author&amp;query=Smith%2C+L+P">Lucian P. 
Smith</a>, <a href="/search/cs?searchtype=author&amp;query=Vasilescu%2C+D">Dan Vasilescu</a>, <a href="/search/cs?searchtype=author&amp;query=Marupilla%2C+G">Gnaneswara Marupilla</a>, <a href="/search/cs?searchtype=author&amp;query=Wilson%2C+M">Michael Wilson</a>, <a href="/search/cs?searchtype=author&amp;query=Agmon%2C+E">Eran Agmon</a>, <a href="/search/cs?searchtype=author&amp;query=Agnew%2C+H">Henry Agnew</a>, <a href="/search/cs?searchtype=author&amp;query=Andrews%2C+S+S">Steven S. Andrews</a>, <a href="/search/cs?searchtype=author&amp;query=Anwar%2C+A">Azraf Anwar</a>, <a href="/search/cs?searchtype=author&amp;query=Beber%2C+M+E">Moritz E. Beber</a>, <a href="/search/cs?searchtype=author&amp;query=Bergmann%2C+F+T">Frank T. Bergmann</a>, <a href="/search/cs?searchtype=author&amp;query=Brooks%2C+D">David Brooks</a>, <a href="/search/cs?searchtype=author&amp;query=Brusch%2C+L">Lutz Brusch</a>, <a href="/search/cs?searchtype=author&amp;query=Calzone%2C+L">Laurence Calzone</a>, <a href="/search/cs?searchtype=author&amp;query=Choi%2C+K">Kiri Choi</a>, <a href="/search/cs?searchtype=author&amp;query=Cooper%2C+J">Joshua Cooper</a>, <a href="/search/cs?searchtype=author&amp;query=Detloff%2C+J">John Detloff</a>, <a href="/search/cs?searchtype=author&amp;query=Drawert%2C+B">Brian Drawert</a>, <a href="/search/cs?searchtype=author&amp;query=Dumontier%2C+M">Michel Dumontier</a>, <a href="/search/cs?searchtype=author&amp;query=Ermentrout%2C+G+B">G. Bard Ermentrout</a>, <a href="/search/cs?searchtype=author&amp;query=Faeder%2C+J+R">James R. Faeder</a>, <a href="/search/cs?searchtype=author&amp;query=Freiburger%2C+A+P">Andrew P. Freiburger</a>, <a href="/search/cs?searchtype=author&amp;query=Fr%C3%B6hlich%2C+F">Fabian Fröhlich</a>, <a href="/search/cs?searchtype=author&amp;query=Funahashi%2C+A">Akira Funahashi</a>, <a href="/search/cs?searchtype=author&amp;query=Garny%2C+A">Alan Garny</a> , et al. 
(46 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2203.06732v1-abstract-short" style="display: inline;"> Computational models have great potential to accelerate bioscience, bioengineering, and medicine. However, it remains challenging to reproduce and reuse simulations, in part, because the numerous formats and methods for simulating various subsystems and scales remain siloed by different software tools. For example, each tool must be executed through a distinct interface. To help investigators find&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.06732v1-abstract-full').style.display = 'inline'; document.getElementById('2203.06732v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2203.06732v1-abstract-full" style="display: none;"> Computational models have great potential to accelerate bioscience, bioengineering, and medicine. However, it remains challenging to reproduce and reuse simulations, in part, because the numerous formats and methods for simulating various subsystems and scales remain siloed by different software tools. For example, each tool must be executed through a distinct interface. To help investigators find and use simulation tools, we developed BioSimulators (https://biosimulators.org), a central registry of the capabilities of simulation tools and consistent Python, command-line, and containerized interfaces to each version of each tool. The foundation of BioSimulators is standards, such as CellML, SBML, SED-ML, and the COMBINE archive format, and validation tools for simulation projects and simulation tools that ensure these standards are used consistently. 
To help modelers find tools for particular projects, we have also used the registry to develop recommendation services. We anticipate that BioSimulators will help modelers exchange, reproduce, and combine simulations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.06732v1-abstract-full').style.display = 'none'; document.getElementById('2203.06732v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 March, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">6 pages, 2 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2203.02833">arXiv:2203.02833</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2203.02833">pdf</a>, <a href="https://arxiv.org/format/2203.02833">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Tabula: Efficiently Computing Nonlinear Activation Functions for Secure Neural Network Inference </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lam%2C+M">Maximilian Lam</a>, <a href="/search/cs?searchtype=author&amp;query=Mitzenmacher%2C+M">Michael Mitzenmacher</a>, <a href="/search/cs?searchtype=author&amp;query=Reddi%2C+V+J">Vijay Janapa Reddi</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+G">Gu-Yeon Wei</a>, <a 
href="/search/cs?searchtype=author&amp;query=Brooks%2C+D">David Brooks</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2203.02833v2-abstract-short" style="display: inline;"> Multiparty computation approaches to secure neural network inference commonly rely on garbled circuits for securely executing nonlinear activation functions. However, garbled circuits require excessive communication between server and client, impose significant storage overheads, and incur large runtime penalties. To reduce these costs, we propose an alternative to garbled circuits: Tabula, an alg&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.02833v2-abstract-full').style.display = 'inline'; document.getElementById('2203.02833v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2203.02833v2-abstract-full" style="display: none;"> Multiparty computation approaches to secure neural network inference commonly rely on garbled circuits for securely executing nonlinear activation functions. However, garbled circuits require excessive communication between server and client, impose significant storage overheads, and incur large runtime penalties. To reduce these costs, we propose an alternative to garbled circuits: Tabula, an algorithm based on secure lookup tables. Our approach precomputes lookup tables during an offline phase that contains the result of all possible nonlinear function calls. Because these tables incur exponential storage costs in the number of operands and the precision of the input values, we use quantization to reduce these storage costs to make this approach practical. 
This enables an online phase where securely computing the result of a nonlinear function requires just a single round of communication, with communication cost equal to twice the number of bits of the input to the nonlinear function. In practice our approach costs 2 bytes of communication per nonlinear function call in the online phase. Compared to garbled circuits with 8-bit quantized inputs, when computing individual nonlinear functions during the online phase, experiments show Tabula with 8-bit activations uses between $280$-$560 \times$ less communication, is over $100\times$ faster, and uses a comparable (within a factor of 2) amount of storage; compared against other state-of-the-art protocols Tabula achieves greater than $40\times$ communication reduction. This leads to significant performance gains over garbled circuits with quantized inputs during the online phase of secure inference of neural networks: Tabula reduces end-to-end inference communication by up to $9 \times$ and achieves an end-to-end inference speedup of up to $50 \times$, while imposing comparable storage and offline preprocessing costs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.02833v2-abstract-full').style.display = 'none'; document.getElementById('2203.02833v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 5 March, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2022. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2201.10036">arXiv:2201.10036</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2201.10036">pdf</a>, <a href="https://arxiv.org/format/2201.10036">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3575693.3575754">10.1145/3575693.3575754 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Carbon Explorer: A Holistic Approach for Designing Carbon Aware Datacenters </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Acun%2C+B">Bilge Acun</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+B">Benjamin Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Kazhamiaka%2C+F">Fiodar Kazhamiaka</a>, <a href="/search/cs?searchtype=author&amp;query=Maeng%2C+K">Kiwan Maeng</a>, <a href="/search/cs?searchtype=author&amp;query=Chakkaravarthy%2C+M">Manoj Chakkaravarthy</a>, <a href="/search/cs?searchtype=author&amp;query=Gupta%2C+U">Udit Gupta</a>, <a href="/search/cs?searchtype=author&amp;query=Brooks%2C+D">David Brooks</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+C">Carole-Jean Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2201.10036v3-abstract-short" style="display: inline;"> Technology companies have been 
leading the way to a renewable energy transformation, by investing in renewable energy sources to reduce the carbon footprint of their datacenters. In addition to helping build new solar and wind farms, companies make power purchase agreements or purchase carbon offsets, rather than relying on renewable energy every hour of the day, every day of the week (24/7). Rely&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2201.10036v3-abstract-full').style.display = 'inline'; document.getElementById('2201.10036v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2201.10036v3-abstract-full" style="display: none;"> Technology companies have been leading the way to a renewable energy transformation, by investing in renewable energy sources to reduce the carbon footprint of their datacenters. In addition to helping build new solar and wind farms, companies make power purchase agreements or purchase carbon offsets, rather than relying on renewable energy every hour of the day, every day of the week (24/7). Relying on renewable energy 24/7 is challenging due to the intermittent nature of wind and solar energy. Inherent variations in solar and wind energy production causes excess or lack of supply at different times. To cope with the fluctuations of renewable energy generation, multiple solutions must be applied. These include: capacity sizing with a mix of solar and wind power, energy storage options, and carbon aware workload scheduling. However, depending on the region and datacenter workload characteristics, the carbon-optimal solution varies. Existing work in this space does not give a holistic view of the trade-offs of each solution and often ignore the embodied carbon cost of the solutions. 
In this work, we provide a framework, Carbon Explorer, to analyze the multi-dimensional solution space by taking into account operational and embodied footprint of the solutions to help make datacenters operate on renewable energy 24/7. The solutions we analyze include capacity sizing with a mix of solar and wind power, battery storage, and carbon aware workload scheduling, which entails shifting the workloads from times when there is lack of renewable supply to times with abundant supply. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2201.10036v3-abstract-full').style.display = 'none'; document.getElementById('2201.10036v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 February, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 January, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2022. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published at ASPLOS&#39;23: Proceedings of the 28th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> C.0; B.0 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2201.08603">arXiv:2201.08603</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2201.08603">pdf</a>, <a href="https://arxiv.org/format/2201.08603">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> </div> </div> <p class="title is-5 mathjax"> Trireme: Exploring Hierarchical Multi-Level Parallelism for Domain Specific Hardware Acceleration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zacharopoulos%2C+G">Georgios Zacharopoulos</a>, <a href="/search/cs?searchtype=author&amp;query=Ejjeh%2C+A">Adel Ejjeh</a>, <a href="/search/cs?searchtype=author&amp;query=Jing%2C+Y">Ying Jing</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+E">En-Yu Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Jia%2C+T">Tianyu Jia</a>, <a href="/search/cs?searchtype=author&amp;query=Brumar%2C+I">Iulian Brumar</a>, <a href="/search/cs?searchtype=author&amp;query=Intan%2C+J">Jeremy Intan</a>, <a href="/search/cs?searchtype=author&amp;query=Huzaifa%2C+M">Muhammad Huzaifa</a>, <a href="/search/cs?searchtype=author&amp;query=Adve%2C+S">Sarita Adve</a>, <a href="/search/cs?searchtype=author&amp;query=Adve%2C+V">Vikram Adve</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+G">Gu-Yeon Wei</a>, <a 
href="/search/cs?searchtype=author&amp;query=Brooks%2C+D">David Brooks</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2201.08603v1-abstract-short" style="display: inline;"> The design of heterogeneous systems that include domain specific accelerators is a challenging and time-consuming process. While taking into account area constraints, designers must decide which parts of an application to accelerate in hardware and which to leave in software. Moreover, applications in domains such as Extended Reality (XR) offer opportunities for various forms of parallel execution&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2201.08603v1-abstract-full').style.display = 'inline'; document.getElementById('2201.08603v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2201.08603v1-abstract-full" style="display: none;"> The design of heterogeneous systems that include domain specific accelerators is a challenging and time-consuming process. While taking into account area constraints, designers must decide which parts of an application to accelerate in hardware and which to leave in software. Moreover, applications in domains such as Extended Reality (XR) offer opportunities for various forms of parallel execution, including loop level, task level and pipeline parallelism. To assist the design process and expose every possible level of parallelism, we present Trireme, a fully automated tool-chain that explores multiple levels of parallelism and produces domain specific accelerator designs and configurations that maximize performance, given an area budget. 
Experiments on demanding benchmarks from the XR domain revealed a speedup of up to 20x, as well as a speedup of up to 37x for smaller applications, compared to software-only implementations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2201.08603v1-abstract-full').style.display = 'none'; document.getElementById('2201.08603v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 January, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">20 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2112.02164">arXiv:2112.02164</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2112.02164">pdf</a>, <a href="https://arxiv.org/format/2112.02164">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1002/mp.15777">10.1002/mp.15777 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Bridging the gap between prostate radiology and pathology through machine learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Bhattacharya%2C+I">Indrani 
Bhattacharya</a>, <a href="/search/cs?searchtype=author&amp;query=Lim%2C+D+S">David S. Lim</a>, <a href="/search/cs?searchtype=author&amp;query=Aung%2C+H+L">Han Lin Aung</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+X">Xingchen Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Seetharaman%2C+A">Arun Seetharaman</a>, <a href="/search/cs?searchtype=author&amp;query=Kunder%2C+C+A">Christian A. Kunder</a>, <a href="/search/cs?searchtype=author&amp;query=Shao%2C+W">Wei Shao</a>, <a href="/search/cs?searchtype=author&amp;query=Soerensen%2C+S+J+C">Simon J. C. Soerensen</a>, <a href="/search/cs?searchtype=author&amp;query=Fan%2C+R+E">Richard E. Fan</a>, <a href="/search/cs?searchtype=author&amp;query=Ghanouni%2C+P">Pejman Ghanouni</a>, <a href="/search/cs?searchtype=author&amp;query=To%27o%2C+K+J">Katherine J. To&#39;o</a>, <a href="/search/cs?searchtype=author&amp;query=Brooks%2C+J+D">James D. Brooks</a>, <a href="/search/cs?searchtype=author&amp;query=Sonn%2C+G+A">Geoffrey A. Sonn</a>, <a href="/search/cs?searchtype=author&amp;query=Rusu%2C+M">Mirabela Rusu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2112.02164v1-abstract-short" style="display: inline;"> Prostate cancer is the second deadliest cancer for American men. While Magnetic Resonance Imaging (MRI) is increasingly used to guide targeted biopsies for prostate cancer diagnosis, its utility remains limited due to high rates of false positives and false negatives as well as low inter-reader agreements. 
Machine learning methods to detect and localize cancer on prostate MRI can help standardize&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2112.02164v1-abstract-full').style.display = 'inline'; document.getElementById('2112.02164v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2112.02164v1-abstract-full" style="display: none;"> Prostate cancer is the second deadliest cancer for American men. While Magnetic Resonance Imaging (MRI) is increasingly used to guide targeted biopsies for prostate cancer diagnosis, its utility remains limited due to high rates of false positives and false negatives as well as low inter-reader agreements. Machine learning methods to detect and localize cancer on prostate MRI can help standardize radiologist interpretations. However, existing machine learning methods vary not only in model architecture, but also in the ground truth labeling strategies used for model training. In this study, we compare different labeling strategies, namely, pathology-confirmed radiologist labels, pathologist labels on whole-mount histopathology images, and lesion-level and pixel-level digital pathologist labels (previously validated deep learning algorithm on histopathology images to predict pixel-level Gleason patterns) on whole-mount histopathology images. We analyse the effects these labels have on the performance of the trained machine learning models. Our experiments show that (1) radiologist labels and models trained with them can miss cancers, or underestimate cancer extent, (2) digital pathologist labels and models trained with them have high concordance with pathologist labels, and (3) models trained with digital pathologist labels achieve the best performance in prostate cancer detection in two different cohorts with different disease distributions, irrespective of the model architecture used. 
Digital pathologist labels can reduce challenges associated with human annotations, including labor, time, inter- and intra-reader variability, and can help bridge the gap between prostate radiology and pathology by enabling the training of reliable machine learning models to detect and localize prostate cancer on MRI. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2112.02164v1-abstract-full').style.display = 'none'; document.getElementById('2112.02164v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 December, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Indrani Bhattacharya and David S. Lim contributed equally as first authors. Geoffrey A. 
Sonn and Mirabela Rusu contributed equally as senior authors</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2111.09222">arXiv:2111.09222</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2111.09222">pdf</a>, <a href="https://arxiv.org/format/2111.09222">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> </div> </div> <p class="title is-5 mathjax"> Early DSE and Automatic Generation of Coarse Grained Merged Accelerators </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Brumar%2C+I">Iulian Brumar</a>, <a href="/search/cs?searchtype=author&amp;query=Zacharopoulos%2C+G">Georgios Zacharopoulos</a>, <a href="/search/cs?searchtype=author&amp;query=Yao%2C+Y">Yuan Yao</a>, <a href="/search/cs?searchtype=author&amp;query=Rama%2C+S">Saketh Rama</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+G">Gu-Yeon Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Brooks%2C+D">David Brooks</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2111.09222v1-abstract-short" style="display: inline;"> Post-Moore&#39;s law area-constrained systems rely on accelerators to deliver performance enhancements. Coarse grained accelerators can offer substantial domain acceleration, but manual, ad-hoc identification of code to accelerate is prohibitively expensive. 
Because cycle-accurate simulators and high-level synthesis flows are so time-consuming, manual creation of high-utilization accelerators that exp&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2111.09222v1-abstract-full').style.display = 'inline'; document.getElementById('2111.09222v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2111.09222v1-abstract-full" style="display: none;"> Post-Moore&#39;s law area-constrained systems rely on accelerators to deliver performance enhancements. Coarse grained accelerators can offer substantial domain acceleration, but manual, ad-hoc identification of code to accelerate is prohibitively expensive. Because cycle-accurate simulators and high-level synthesis flows are so time-consuming, manual creation of high-utilization accelerators that exploit control and data flow patterns at optimal granularities is rarely successful. To address these challenges, we present AccelMerger, the first automated methodology to create coarse grained, control- and data-flow-rich, merged accelerators. AccelMerger uses sequence alignment matching to recognize similar function call-graphs and loops, and neural networks to quickly evaluate their post-HLS characteristics. It accurately identifies which functions to accelerate, and it merges accelerators to respect an area budget and to accommodate system communication characteristics like latency and bandwidth. Merging two accelerators can save as much as 99% of the area of one. The space saved is used by a globally optimal integer linear program to allocate more accelerators for increased performance. We demonstrate AccelMerger&#39;s effectiveness using HLS flows without any manual effort to fine-tune the resulting designs. 
On FPGA-based systems, AccelMerger yields application performance improvements of up to 16.7x over software implementations, and 1.91x on average with respect to state-of-the-art early-stage design space exploration tools. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2111.09222v1-abstract-full').style.display = 'none'; document.getElementById('2111.09222v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 November, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2021. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2111.04807">arXiv:2111.04807</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2111.04807">pdf</a>, <a href="https://arxiv.org/ps/2111.04807">ps</a>, <a href="https://arxiv.org/format/2111.04807">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Unsupervised Approaches for Out-Of-Distribution Dermoscopic Lesion Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Torop%2C+M">Max Torop</a>, <a href="/search/cs?searchtype=author&amp;query=Ghimire%2C+S">Sandesh Ghimire</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+W">Wenqian Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Brooks%2C+D+H">Dana H. 
Brooks</a>, <a href="/search/cs?searchtype=author&amp;query=Camps%2C+O">Octavia Camps</a>, <a href="/search/cs?searchtype=author&amp;query=Rajadhyaksha%2C+M">Milind Rajadhyaksha</a>, <a href="/search/cs?searchtype=author&amp;query=Dy%2C+J">Jennifer Dy</a>, <a href="/search/cs?searchtype=author&amp;query=Kose%2C+K">Kivanc Kose</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2111.04807v1-abstract-short" style="display: inline;"> There are limited works showing the efficacy of unsupervised Out-of-Distribution (OOD) methods on complex medical data. Here, we present preliminary findings of our unsupervised OOD detection algorithm, SimCLR-LOF, as well as a recent state of the art approach (SSD), applied on medical images. SimCLR-LOF learns semantically meaningful features using SimCLR and uses LOF for scoring if a test sample&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2111.04807v1-abstract-full').style.display = 'inline'; document.getElementById('2111.04807v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2111.04807v1-abstract-full" style="display: none;"> There are limited works showing the efficacy of unsupervised Out-of-Distribution (OOD) methods on complex medical data. Here, we present preliminary findings of our unsupervised OOD detection algorithm, SimCLR-LOF, as well as a recent state of the art approach (SSD), applied on medical images. SimCLR-LOF learns semantically meaningful features using SimCLR and uses LOF for scoring if a test sample is OOD. We evaluated on the multi-source International Skin Imaging Collaboration (ISIC) 2019 dataset, and show results that are competitive with SSD as well as with recent supervised approaches applied on the same data. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2111.04807v1-abstract-full').style.display = 'none'; document.getElementById('2111.04807v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 November, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS: Medical Imaging Meets NeurIPS Workshop</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2111.00364">arXiv:2111.00364</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2111.00364">pdf</a>, <a href="https://arxiv.org/format/2111.00364">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> </div> </div> <p class="title is-5 mathjax"> Sustainable AI: Environmental Implications, Challenges and Opportunities </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wu%2C+C">Carole-Jean Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Raghavendra%2C+R">Ramya Raghavendra</a>, <a href="/search/cs?searchtype=author&amp;query=Gupta%2C+U">Udit Gupta</a>, <a href="/search/cs?searchtype=author&amp;query=Acun%2C+B">Bilge Acun</a>, <a href="/search/cs?searchtype=author&amp;query=Ardalani%2C+N">Newsha Ardalani</a>, <a href="/search/cs?searchtype=author&amp;query=Maeng%2C+K">Kiwan Maeng</a>, <a 
href="/search/cs?searchtype=author&amp;query=Chang%2C+G">Gloria Chang</a>, <a href="/search/cs?searchtype=author&amp;query=Behram%2C+F+A">Fiona Aga Behram</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+J">James Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Bai%2C+C">Charles Bai</a>, <a href="/search/cs?searchtype=author&amp;query=Gschwind%2C+M">Michael Gschwind</a>, <a href="/search/cs?searchtype=author&amp;query=Gupta%2C+A">Anurag Gupta</a>, <a href="/search/cs?searchtype=author&amp;query=Ott%2C+M">Myle Ott</a>, <a href="/search/cs?searchtype=author&amp;query=Melnikov%2C+A">Anastasia Melnikov</a>, <a href="/search/cs?searchtype=author&amp;query=Candido%2C+S">Salvatore Candido</a>, <a href="/search/cs?searchtype=author&amp;query=Brooks%2C+D">David Brooks</a>, <a href="/search/cs?searchtype=author&amp;query=Chauhan%2C+G">Geeta Chauhan</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+B">Benjamin Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+H+S">Hsien-Hsin S. Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Akyildiz%2C+B">Bugra Akyildiz</a>, <a href="/search/cs?searchtype=author&amp;query=Balandat%2C+M">Maximilian Balandat</a>, <a href="/search/cs?searchtype=author&amp;query=Spisak%2C+J">Joe Spisak</a>, <a href="/search/cs?searchtype=author&amp;query=Jain%2C+R">Ravi Jain</a>, <a href="/search/cs?searchtype=author&amp;query=Rabbat%2C+M">Mike Rabbat</a>, <a href="/search/cs?searchtype=author&amp;query=Hazelwood%2C+K">Kim Hazelwood</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2111.00364v2-abstract-short" style="display: inline;"> This paper explores the environmental impact of the super-linear growth trends for AI from a holistic perspective, spanning Data, Algorithms, and System Hardware. 
We characterize the carbon footprint of AI computing by examining the model development cycle across industry-scale machine learning use cases and, at the same time, considering the life cycle of system hardware. Taking a step further, w&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2111.00364v2-abstract-full').style.display = 'inline'; document.getElementById('2111.00364v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2111.00364v2-abstract-full" style="display: none;"> This paper explores the environmental impact of the super-linear growth trends for AI from a holistic perspective, spanning Data, Algorithms, and System Hardware. We characterize the carbon footprint of AI computing by examining the model development cycle across industry-scale machine learning use cases and, at the same time, considering the life cycle of system hardware. Taking a step further, we capture the operational and manufacturing carbon footprint of AI computing and present an end-to-end analysis for what and how hardware-software design and at-scale optimization can help reduce the overall carbon footprint of AI. Based on the industry experience and lessons learned, we share the key challenges and chart out important development directions across the many dimensions of AI. We hope the key messages and insights presented in this paper can inspire the community to advance the field of AI in an environmentally-responsible manner. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2111.00364v2-abstract-full').style.display = 'none'; document.getElementById('2111.00364v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 January, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 30 October, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2021. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2110.12392">arXiv:2110.12392</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2110.12392">pdf</a>, <a href="https://arxiv.org/format/2110.12392">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Neurons and Cognition">q-bio.NC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Variation is the Norm: Brain State Dynamics Evoked By Emotional Video Clips </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Singh%2C+A">Ashutosh Singh</a>, <a href="/search/cs?searchtype=author&amp;query=Westlin%2C+C">Christiana Westlin</a>, <a href="/search/cs?searchtype=author&amp;query=Eisenbarth%2C+H">Hedwig Eisenbarth</a>, <a href="/search/cs?searchtype=author&amp;query=Losin%2C+E+A+R">Elizabeth A. Reynolds Losin</a>, <a href="/search/cs?searchtype=author&amp;query=Andrews-Hanna%2C+J+R">Jessica R. Andrews-Hanna</a>, <a href="/search/cs?searchtype=author&amp;query=Wager%2C+T+D">Tor D. Wager</a>, <a href="/search/cs?searchtype=author&amp;query=Satpute%2C+A+B">Ajay B. 
Satpute</a>, <a href="/search/cs?searchtype=author&amp;query=Barrett%2C+L+F">Lisa Feldman Barrett</a>, <a href="/search/cs?searchtype=author&amp;query=Brooks%2C+D+H">Dana H. Brooks</a>, <a href="/search/cs?searchtype=author&amp;query=Erdogmus%2C+D">Deniz Erdogmus</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2110.12392v1-abstract-short" style="display: inline;"> For the last several decades, emotion research has attempted to identify a &#34;biomarker&#34; or consistent pattern of brain activity to characterize a single category of emotion (e.g., fear) that will remain consistent across all instances of that category, regardless of individual and context. In this study, we investigated variation rather than consistency during emotional experiences while people wat&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.12392v1-abstract-full').style.display = 'inline'; document.getElementById('2110.12392v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2110.12392v1-abstract-full" style="display: none;"> For the last several decades, emotion research has attempted to identify a &#34;biomarker&#34; or consistent pattern of brain activity to characterize a single category of emotion (e.g., fear) that will remain consistent across all instances of that category, regardless of individual and context. In this study, we investigated variation rather than consistency during emotional experiences while people watched video clips chosen to evoke instances of specific emotion categories. Specifically, we developed a sequential probabilistic approach to model the temporal dynamics in a participant&#39;s brain activity during video viewing. 
We characterized brain states during these clips as distinct state occupancy periods between state transitions in blood oxygen level dependent (BOLD) signal patterns. We found substantial variation in the state occupancy probability distributions across individuals watching the same video, supporting the hypothesis that when it comes to the brain correlates of emotional experience, variation may indeed be the norm. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.12392v1-abstract-full').style.display = 'none'; document.getElementById('2110.12392v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 October, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2021. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2109.01188">arXiv:2109.01188</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2109.01188">pdf</a>, <a href="https://arxiv.org/format/2109.01188">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Emerging Technologies">cs.ET</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> </div> </div> <p class="title is-5 mathjax"> NVMExplorer: A Framework for Cross-Stack Comparisons of Embedded Non-Volatile Memories </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Pentecost%2C+L">Lillian Pentecost</a>, <a href="/search/cs?searchtype=author&amp;query=Hankin%2C+A">Alexander Hankin</a>, <a href="/search/cs?searchtype=author&amp;query=Donato%2C+M">Marco Donato</a>, <a href="/search/cs?searchtype=author&amp;query=Hempstead%2C+M">Mark Hempstead</a>, <a 
href="/search/cs?searchtype=author&amp;query=Wei%2C+G">Gu-Yeon Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Brooks%2C+D">David Brooks</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2109.01188v2-abstract-short" style="display: inline;"> Repeated off-chip memory accesses to DRAM drive up operating power for data-intensive applications, and SRAM technology scaling and leakage power limits the efficiency of embedded memories. Future on-chip storage will need higher density and energy efficiency, and the actively expanding field of emerging, embeddable non-volatile memory (eNVM) technologies is providing many potential candidates to&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2109.01188v2-abstract-full').style.display = 'inline'; document.getElementById('2109.01188v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2109.01188v2-abstract-full" style="display: none;"> Repeated off-chip memory accesses to DRAM drive up operating power for data-intensive applications, and SRAM technology scaling and leakage power limits the efficiency of embedded memories. Future on-chip storage will need higher density and energy efficiency, and the actively expanding field of emerging, embeddable non-volatile memory (eNVM) technologies is providing many potential candidates to satisfy this need. Each technology proposal presents distinct trade-offs in terms of density, read, write, and reliability characteristics, and we present a comprehensive framework for navigating and quantifying these design trade-offs alongside realistic system constraints and application-level impacts. 
This work evaluates eNVM-based storage for a range of application and system contexts including machine learning on the edge, graph analytics, and general purpose cache hierarchy, in addition to describing a freely available (http://nvmexplorer.seas.harvard.edu/) set of tools for application experts, system designers, and device experts to better understand, compare, and quantify the next generation of embedded memory solutions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2109.01188v2-abstract-full').style.display = 'none'; document.getElementById('2109.01188v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 January, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 September, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">18 pages, 14 figures, 3 tables</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> B.3; I.6 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2106.11757">arXiv:2106.11757</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2106.11757">pdf</a>, <a href="https://arxiv.org/format/2106.11757">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> Application-driven Design Exploration for Dense Ferroelectric Embedded Non-volatile Memories </p> <p class="authors"> <span class="search-hit">Authors:</span> <a 
href="/search/cs?searchtype=author&amp;query=Sharifi%2C+M+M">Mohammad Mehdi Sharifi</a>, <a href="/search/cs?searchtype=author&amp;query=Pentecost%2C+L">Lillian Pentecost</a>, <a href="/search/cs?searchtype=author&amp;query=Rajaei%2C+R">Ramin Rajaei</a>, <a href="/search/cs?searchtype=author&amp;query=Kazemi%2C+A">Arman Kazemi</a>, <a href="/search/cs?searchtype=author&amp;query=Lou%2C+Q">Qiuwen Lou</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+G">Gu-Yeon Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Brooks%2C+D">David Brooks</a>, <a href="/search/cs?searchtype=author&amp;query=Ni%2C+K">Kai Ni</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+X+S">X. Sharon Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Niemier%2C+M">Michael Niemier</a>, <a href="/search/cs?searchtype=author&amp;query=Donato%2C+M">Marco Donato</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2106.11757v1-abstract-short" style="display: inline;"> The memory wall bottleneck is a key challenge across many data-intensive applications. Multi-level FeFET-based embedded non-volatile memories are a promising solution for denser and more energy-efficient on-chip memory. However, reliable multi-level cell storage requires careful optimizations to minimize the design overhead costs. In this work, we investigate the interplay between FeFET device cha&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2106.11757v1-abstract-full').style.display = 'inline'; document.getElementById('2106.11757v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2106.11757v1-abstract-full" style="display: none;"> The memory wall bottleneck is a key challenge across many data-intensive applications. 
Multi-level FeFET-based embedded non-volatile memories are a promising solution for denser and more energy-efficient on-chip memory. However, reliable multi-level cell storage requires careful optimizations to minimize the design overhead costs. In this work, we investigate the interplay between FeFET device characteristics, programming schemes, and memory array architecture, and explore different design choices to optimize performance, energy, area, and accuracy metrics for critical data-intensive workloads. From our cross-stack design exploration, we find that we can store DNN weights and social network graphs at a density of over 8MB/mm^2 and sub-2ns read access latency without loss in application accuracy. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2106.11757v1-abstract-full').style.display = 'none'; document.getElementById('2106.11757v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 June, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2021. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at ISLPED 2021</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2106.06089">arXiv:2106.06089</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2106.06089">pdf</a>, <a href="https://arxiv.org/format/2106.06089">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Gradient Disaggregation: Breaking Privacy in Federated Learning by Reconstructing the User Participant Matrix </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lam%2C+M">Maximilian Lam</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+G">Gu-Yeon Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Brooks%2C+D">David Brooks</a>, <a href="/search/cs?searchtype=author&amp;query=Reddi%2C+V+J">Vijay Janapa Reddi</a>, <a href="/search/cs?searchtype=author&amp;query=Mitzenmacher%2C+M">Michael Mitzenmacher</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2106.06089v1-abstract-short" style="display: inline;"> We show that aggregated model updates in federated learning may be insecure. An untrusted central server may disaggregate user updates from sums of updates across participants given repeated observations, enabling the server to recover privileged information about individual users&#39; private training data via traditional gradient inference attacks. 
Our method revolves around reconstructing participa&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2106.06089v1-abstract-full').style.display = 'inline'; document.getElementById('2106.06089v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2106.06089v1-abstract-full" style="display: none;"> We show that aggregated model updates in federated learning may be insecure. An untrusted central server may disaggregate user updates from sums of updates across participants given repeated observations, enabling the server to recover privileged information about individual users&#39; private training data via traditional gradient inference attacks. Our method revolves around reconstructing participant information (e.g: which rounds of training users participated in) from aggregated model updates by leveraging summary information from device analytics commonly used to monitor, debug, and manage federated learning systems. Our attack is parallelizable and we successfully disaggregate user updates on settings with up to thousands of participants. We quantitatively and qualitatively demonstrate significant improvements in the capability of various inference attacks on the disaggregated updates. Our attack enables the attribution of learned properties to individual users, violating anonymity, and shows that a determined central server may undermine the secure aggregation protocol to break individual users&#39; data privacy in federated learning. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2106.06089v1-abstract-full').style.display = 'none'; document.getElementById('2106.06089v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 June, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICML 2021</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2105.12882">arXiv:2105.12882</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2105.12882">pdf</a>, <a href="https://arxiv.org/format/2105.12882">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> MAVFI: An End-to-End Fault Analysis Framework with Anomaly Detection and Recovery for Micro Aerial Vehicles </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hsiao%2C+Y">Yu-Shun Hsiao</a>, <a href="/search/cs?searchtype=author&amp;query=Wan%2C+Z">Zishen Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Jia%2C+T">Tianyu Jia</a>, <a href="/search/cs?searchtype=author&amp;query=Ghosal%2C+R">Radhika Ghosal</a>, <a href="/search/cs?searchtype=author&amp;query=Mahmoud%2C+A">Abdulrahman Mahmoud</a>, <a href="/search/cs?searchtype=author&amp;query=Raychowdhury%2C+A">Arijit Raychowdhury</a>, <a href="/search/cs?searchtype=author&amp;query=Brooks%2C+D">David Brooks</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+G">Gu-Yeon Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Reddi%2C+V+J">Vijay Janapa 
Reddi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2105.12882v3-abstract-short" style="display: inline;"> Safety and resilience are critical for autonomous unmanned aerial vehicles (UAVs). We introduce MAVFI, the micro aerial vehicles (MAVs) resilience analysis methodology to assess the effect of silent data corruption (SDC) on UAVs&#39; mission metrics, such as flight time and success rate, for accurately measuring system resilience. To enhance the safety and resilience of robot systems bound by size, we&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2105.12882v3-abstract-full').style.display = 'inline'; document.getElementById('2105.12882v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2105.12882v3-abstract-full" style="display: none;"> Safety and resilience are critical for autonomous unmanned aerial vehicles (UAVs). We introduce MAVFI, the micro aerial vehicles (MAVs) resilience analysis methodology to assess the effect of silent data corruption (SDC) on UAVs&#39; mission metrics, such as flight time and success rate, for accurately measuring system resilience. To enhance the safety and resilience of robot systems bound by size, weight, and power (SWaP), we offer two low-overhead anomaly-based SDC detection and recovery algorithms based on Gaussian statistical models and autoencoder neural networks. Our anomaly error protection techniques are validated in numerous simulated environments. We demonstrate that the autoencoder-based technique can recover up to all failure cases in our studied scenarios with a computational overhead of no more than 0.0062%. 
Our application-aware resilience analysis framework, MAVFI, can be utilized to comprehensively test the resilience of other Robot Operating System (ROS)-based applications and is publicly available at https://github.com/harvard-edge/MAVBench/tree/mavfi. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2105.12882v3-abstract-full').style.display = 'none'; document.getElementById('2105.12882v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 January, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 May, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">6 pages, 9 figures; The first two authors have equal contributions; Accepted as a conference paper in DATE 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2105.08820">arXiv:2105.08820</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2105.08820">pdf</a>, <a href="https://arxiv.org/format/2105.08820">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> RecPipe: Co-designing Models and Hardware to Jointly Optimize Recommendation Quality and Performance </p> <p class="authors"> <span class="search-hit">Authors:</span> <a 
href="/search/cs?searchtype=author&amp;query=Gupta%2C+U">Udit Gupta</a>, <a href="/search/cs?searchtype=author&amp;query=Hsia%2C+S">Samuel Hsia</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+J">Jeff Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Wilkening%2C+M">Mark Wilkening</a>, <a href="/search/cs?searchtype=author&amp;query=Pombra%2C+J">Javin Pombra</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+H+S">Hsien-Hsin S. Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+G">Gu-Yeon Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+C">Carole-Jean Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Brooks%2C+D">David Brooks</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2105.08820v2-abstract-short" style="display: inline;"> Deep learning recommendation systems must provide high quality, personalized content under strict tail-latency targets and high system loads. This paper presents RecPipe, a system to jointly optimize recommendation quality and inference performance. Central to RecPipe is decomposing recommendation models into multi-stage pipelines to maintain quality while reducing compute complexity and exposing&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2105.08820v2-abstract-full').style.display = 'inline'; document.getElementById('2105.08820v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2105.08820v2-abstract-full" style="display: none;"> Deep learning recommendation systems must provide high quality, personalized content under strict tail-latency targets and high system loads. This paper presents RecPipe, a system to jointly optimize recommendation quality and inference performance. 
Central to RecPipe is decomposing recommendation models into multi-stage pipelines to maintain quality while reducing compute complexity and exposing distinct parallelism opportunities. RecPipe implements an inference scheduler to map multi-stage recommendation engines onto commodity, heterogeneous platforms (e.g., CPUs, GPUs). While the hardware-aware scheduling improves ranking efficiency, the commodity platforms suffer from many limitations requiring specialized hardware. Thus, we design RecPipeAccel (RPAccel), a custom accelerator that jointly optimizes quality, tail-latency, and system throughput. RPAccel is designed specifically to exploit the distinct design space opened via RecPipe. In particular, RPAccel processes queries in sub-batches to pipeline recommendation stages, implements dual static and dynamic embedding caches, a set of top-k filtering units, and a reconfigurable systolic array. Compared to prior-art and at iso-quality, we demonstrate that RPAccel improves latency and throughput by 3x and 6x. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2105.08820v2-abstract-full').style.display = 'none'; document.getElementById('2105.08820v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 May, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 May, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2021. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2102.02988">arXiv:2102.02988</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2102.02988">pdf</a>, <a href="https://arxiv.org/format/2102.02988">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> AutoPilot: Automating SoC Design Space Exploration for SWaP Constrained Autonomous UAVs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Krishnan%2C+S">Srivatsan Krishnan</a>, <a href="/search/cs?searchtype=author&amp;query=Wan%2C+Z">Zishen Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Bhardwaj%2C+K">Kshitij Bhardwaj</a>, <a href="/search/cs?searchtype=author&amp;query=Whatmough%2C+P">Paul Whatmough</a>, <a href="/search/cs?searchtype=author&amp;query=Faust%2C+A">Aleksandra Faust</a>, <a href="/search/cs?searchtype=author&amp;query=Neuman%2C+S">Sabrina Neuman</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+G">Gu-Yeon Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Brooks%2C+D">David Brooks</a>, <a href="/search/cs?searchtype=author&amp;query=Reddi%2C+V+J">Vijay Janapa Reddi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2102.02988v3-abstract-short" style="display: inline;"> Building domain-specific accelerators for autonomous unmanned aerial vehicles (UAVs) 
is challenging due to a lack of systematic methodology for designing onboard compute. Balancing a computing system for a UAV requires considering both the cyber (e.g., sensor rate, compute performance) and physical (e.g., payload weight) characteristics that affect overall performance. Iterating over the many comp&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2102.02988v3-abstract-full').style.display = 'inline'; document.getElementById('2102.02988v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2102.02988v3-abstract-full" style="display: none;"> Building domain-specific accelerators for autonomous unmanned aerial vehicles (UAVs) is challenging due to a lack of systematic methodology for designing onboard compute. Balancing a computing system for a UAV requires considering both the cyber (e.g., sensor rate, compute performance) and physical (e.g., payload weight) characteristics that affect overall performance. Iterating over the many component choices results in a combinatorial explosion of the number of possible combinations: from 10s of thousands to billions, depending on implementation details. Manually selecting combinations of these components is tedious and expensive. To navigate the cyber-physical design space efficiently, we introduce <em>AutoPilot</em>, a framework that automates full-system UAV co-design. AutoPilot uses Bayesian optimization to navigate a large design space and automatically select a combination of autonomy algorithm and hardware accelerator while considering the cross-product effect of other cyber and physical UAV components. 
We show that the AutoPilot methodology consistently outperforms general-purpose hardware selections like Xavier NX and Jetson TX2, as well as dedicated hardware accelerators built for autonomous UAVs, across a range of representative scenarios (three different UAV types and three deployment environments). Designs generated by AutoPilot increase the number of missions on average by up to 2.25x, 1.62x, and 1.43x for nano, micro, and mini-UAVs respectively over baselines. Our work demonstrates the need for holistic full-UAV co-design to achieve maximum overall UAV performance and the need for automated flows to simplify the design process for autonomous cyber-physical systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2102.02988v3-abstract-full').style.display = 'none'; document.getElementById('2102.02988v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 September, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 February, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2021. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2102.00075">arXiv:2102.00075</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2102.00075">pdf</a>, <a href="https://arxiv.org/format/2102.00075">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> RecSSD: Near Data Processing for Solid State Drive Based Recommendation Inference </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wilkening%2C+M">Mark Wilkening</a>, <a href="/search/cs?searchtype=author&amp;query=Gupta%2C+U">Udit Gupta</a>, <a href="/search/cs?searchtype=author&amp;query=Hsia%2C+S">Samuel Hsia</a>, <a href="/search/cs?searchtype=author&amp;query=Trippel%2C+C">Caroline Trippel</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+C">Carole-Jean Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Brooks%2C+D">David Brooks</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+G">Gu-Yeon Wei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2102.00075v1-abstract-short" style="display: inline;"> Neural personalized recommendation models are used across a wide variety of datacenter applications including search, social media, and entertainment. State-of-the-art models comprise large embedding tables that have billions of parameters requiring large memory capacities. Unfortunately, large and fast DRAM-based memories levy high infrastructure costs. 
Conventional SSD-based storage solutions of&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2102.00075v1-abstract-full').style.display = 'inline'; document.getElementById('2102.00075v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2102.00075v1-abstract-full" style="display: none;"> Neural personalized recommendation models are used across a wide variety of datacenter applications including search, social media, and entertainment. State-of-the-art models comprise large embedding tables that have billions of parameters requiring large memory capacities. Unfortunately, large and fast DRAM-based memories levy high infrastructure costs. Conventional SSD-based storage solutions offer an order of magnitude larger capacity, but have worse read latency and bandwidth, degrading inference performance. RecSSD is a near data processing based SSD memory system customized for neural recommendation inference that reduces end-to-end model inference latency by 2X compared to using COTS SSDs across eight industry-representative models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2102.00075v1-abstract-full').style.display = 'none'; document.getElementById('2102.00075v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 January, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2021. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2012.05928">arXiv:2012.05928</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2012.05928">pdf</a>, <a href="https://arxiv.org/format/2012.05928">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Astrophysics of Galaxies">astro-ph.GA</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cosmology and Nongalactic Astrophysics">astro-ph.CO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Instrumentation and Methods for Astrophysics">astro-ph.IM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1093/mnras/stab164">10.1093/mnras/stab164 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> A machine learning approach to galaxy properties: joint redshift-stellar mass probability distributions with Random Forest </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Mucesh%2C+S">S. Mucesh</a>, <a href="/search/cs?searchtype=author&amp;query=Hartley%2C+W+G">W. G. Hartley</a>, <a href="/search/cs?searchtype=author&amp;query=Palmese%2C+A">A. Palmese</a>, <a href="/search/cs?searchtype=author&amp;query=Lahav%2C+O">O. Lahav</a>, <a href="/search/cs?searchtype=author&amp;query=Whiteway%2C+L">L. Whiteway</a>, <a href="/search/cs?searchtype=author&amp;query=Bluck%2C+A+F+L">A. F. L. Bluck</a>, <a href="/search/cs?searchtype=author&amp;query=Alarcon%2C+A">A. 
Alarcon</a>, <a href="/search/cs?searchtype=author&amp;query=Amon%2C+A">A. Amon</a>, <a href="/search/cs?searchtype=author&amp;query=Bechtol%2C+K">K. Bechtol</a>, <a href="/search/cs?searchtype=author&amp;query=Bernstein%2C+G+M">G. M. Bernstein</a>, <a href="/search/cs?searchtype=author&amp;query=Rosell%2C+A+C">A. Carnero Rosell</a>, <a href="/search/cs?searchtype=author&amp;query=Kind%2C+M+C">M. Carrasco Kind</a>, <a href="/search/cs?searchtype=author&amp;query=Choi%2C+A">A. Choi</a>, <a href="/search/cs?searchtype=author&amp;query=Eckert%2C+K">K. Eckert</a>, <a href="/search/cs?searchtype=author&amp;query=Everett%2C+S">S. Everett</a>, <a href="/search/cs?searchtype=author&amp;query=Gruen%2C+D">D. Gruen</a>, <a href="/search/cs?searchtype=author&amp;query=Gruendl%2C+R+A">R. A. Gruendl</a>, <a href="/search/cs?searchtype=author&amp;query=Harrison%2C+I">I. Harrison</a>, <a href="/search/cs?searchtype=author&amp;query=Huff%2C+E+M">E. M. Huff</a>, <a href="/search/cs?searchtype=author&amp;query=Kuropatkin%2C+N">N. Kuropatkin</a>, <a href="/search/cs?searchtype=author&amp;query=Sevilla-Noarbe%2C+I">I. Sevilla-Noarbe</a>, <a href="/search/cs?searchtype=author&amp;query=Sheldon%2C+E">E. Sheldon</a>, <a href="/search/cs?searchtype=author&amp;query=Yanny%2C+B">B. Yanny</a>, <a href="/search/cs?searchtype=author&amp;query=Aguena%2C+M">M. Aguena</a>, <a href="/search/cs?searchtype=author&amp;query=Allam%2C+S">S. Allam</a> , et al. (50 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2012.05928v2-abstract-short" style="display: inline;"> We demonstrate that highly accurate joint redshift-stellar mass probability distribution functions (PDFs) can be obtained using the Random Forest (RF) machine learning (ML) algorithm, even with few photometric bands available. 
As an example, we use the Dark Energy Survey (DES), combined with the COSMOS2015 catalogue for redshifts and stellar masses. We build two ML models: one containing deep phot&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2012.05928v2-abstract-full').style.display = 'inline'; document.getElementById('2012.05928v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2012.05928v2-abstract-full" style="display: none;"> We demonstrate that highly accurate joint redshift-stellar mass probability distribution functions (PDFs) can be obtained using the Random Forest (RF) machine learning (ML) algorithm, even with few photometric bands available. As an example, we use the Dark Energy Survey (DES), combined with the COSMOS2015 catalogue for redshifts and stellar masses. We build two ML models: one containing deep photometry in the $griz$ bands, and the second reflecting the photometric scatter present in the main DES survey, with carefully constructed representative training data in each case. We validate our joint PDFs for $10,699$ test galaxies by utilizing the copula probability integral transform and the Kendall distribution function, and their univariate counterparts to validate the marginals. Benchmarked against a basic set-up of the template-fitting code BAGPIPES, our ML-based method outperforms template fitting on all of our predefined performance metrics. In addition to accuracy, the RF is extremely fast, able to compute joint PDFs for a million galaxies in just under $6$ min with consumer computer hardware. Such speed enables PDFs to be derived in real time within analysis codes, solving potential storage issues. As part of this work we have developed GALPRO, a highly intuitive and efficient Python package to rapidly generate multivariate PDFs on-the-fly. 
GALPRO is documented and available for researchers to use in their cosmology and galaxy evolution studies. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2012.05928v2-abstract-full').style.display = 'none'; document.getElementById('2012.05928v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 February, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 December, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">18 pages, 8 figures, Accepted by MNRAS</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Report number:</span> FERMILAB-PUB-20-653-AE, DES-2020-0542 </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Monthly Notices of the Royal Astronomical Society, Volume 502, Issue 2, April 2021, Pages 2770-2786 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2011.14203">arXiv:2011.14203</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2011.14203">pdf</a>, <a href="https://arxiv.org/format/2011.14203">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> EdgeBERT: Sentence-Level Energy Optimizations for Latency-Aware Multi-Task NLP Inference </p> <p class="authors"> <span class="search-hit">Authors:</span> <a 
href="/search/cs?searchtype=author&amp;query=Tambe%2C+T">Thierry Tambe</a>, <a href="/search/cs?searchtype=author&amp;query=Hooper%2C+C">Coleman Hooper</a>, <a href="/search/cs?searchtype=author&amp;query=Pentecost%2C+L">Lillian Pentecost</a>, <a href="/search/cs?searchtype=author&amp;query=Jia%2C+T">Tianyu Jia</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+E">En-Yu Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Donato%2C+M">Marco Donato</a>, <a href="/search/cs?searchtype=author&amp;query=Sanh%2C+V">Victor Sanh</a>, <a href="/search/cs?searchtype=author&amp;query=Whatmough%2C+P+N">Paul N. Whatmough</a>, <a href="/search/cs?searchtype=author&amp;query=Rush%2C+A+M">Alexander M. Rush</a>, <a href="/search/cs?searchtype=author&amp;query=Brooks%2C+D">David Brooks</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+G">Gu-Yeon Wei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2011.14203v5-abstract-short" style="display: inline;"> Transformer-based language models such as BERT provide significant accuracy improvement for a multitude of natural language processing (NLP) tasks. However, their hefty computational and memory demands make them challenging to deploy to resource-constrained edge platforms with strict latency requirements. 
We present EdgeBERT, an in-depth algorithm-hardware co-design for latency-aware energy optimi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2011.14203v5-abstract-full').style.display = 'inline'; document.getElementById('2011.14203v5-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2011.14203v5-abstract-full" style="display: none;"> Transformer-based language models such as BERT provide significant accuracy improvement for a multitude of natural language processing (NLP) tasks. However, their hefty computational and memory demands make them challenging to deploy to resource-constrained edge platforms with strict latency requirements. We present EdgeBERT, an in-depth algorithm-hardware co-design for latency-aware energy optimization for multi-task NLP. EdgeBERT employs entropy-based early exit predication in order to perform dynamic voltage-frequency scaling (DVFS), at a sentence granularity, for minimal energy consumption while adhering to a prescribed target latency. Computation and memory footprint overheads are further alleviated by employing a calibrated combination of adaptive attention span, selective network pruning, and floating-point quantization. Furthermore, in order to maximize the synergistic benefits of these algorithms in always-on and intermediate edge computing settings, we specialize a 12nm scalable hardware accelerator system, integrating a fast-switching low-dropout voltage regulator (LDO), an all-digital phase-locked loop (ADPLL), as well as, high-density embedded non-volatile memories (eNVMs) wherein the sparse floating-point bit encodings of the shared multi-task parameters are carefully stored. 
Altogether, latency-aware multi-task NLP inference acceleration on the EdgeBERT hardware system generates up to 7x, 2.5x, and 53x lower energy compared to the conventional inference without early stopping, the latency-unbounded early exit approach, and CUDA adaptations on an Nvidia Jetson Tegra X2 mobile GPU, respectively. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2011.14203v5-abstract-full').style.display = 'none'; document.getElementById('2011.14203v5-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 September, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 November, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages plus references. 
Paper to appear at the 54th IEEE/ACM International Symposium on Microarchitecture (MICRO 2021)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2011.02839">arXiv:2011.02839</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2011.02839">pdf</a>, <a href="https://arxiv.org/format/2011.02839">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> </div> </div> <p class="title is-5 mathjax"> Chasing Carbon: The Elusive Environmental Footprint of Computing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gupta%2C+U">Udit Gupta</a>, <a href="/search/cs?searchtype=author&amp;query=Kim%2C+Y+G">Young Geun Kim</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+S">Sylvia Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Tse%2C+J">Jordan Tse</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+H+S">Hsien-Hsin S. Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+G">Gu-Yeon Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Brooks%2C+D">David Brooks</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+C">Carole-Jean Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2011.02839v1-abstract-short" style="display: inline;"> Given recent algorithm, software, and hardware innovation, computing has enabled a plethora of new applications. As computing becomes increasingly ubiquitous, however, so does its environmental impact. This paper brings the issue to the attention of computer-systems researchers. 
Our analysis, built on industry-reported characterization, quantifies the environmental effects of computing in terms of&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2011.02839v1-abstract-full').style.display = 'inline'; document.getElementById('2011.02839v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2011.02839v1-abstract-full" style="display: none;"> Given recent algorithm, software, and hardware innovation, computing has enabled a plethora of new applications. As computing becomes increasingly ubiquitous, however, so does its environmental impact. This paper brings the issue to the attention of computer-systems researchers. Our analysis, built on industry-reported characterization, quantifies the environmental effects of computing in terms of carbon emissions. Broadly, carbon emissions have two sources: operational energy consumption, and hardware manufacturing and infrastructure. Although carbon emissions from the former are decreasing thanks to algorithmic, software, and hardware innovations that boost performance and power efficiency, the overall carbon footprint of computer systems continues to grow. This work quantifies the carbon output of computer systems to show that most emissions related to modern mobile and data-center equipment come from hardware manufacturing and infrastructure. We therefore outline future directions for minimizing the environmental impact of computing systems. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2011.02839v1-abstract-full').style.display = 'none'; document.getElementById('2011.02839v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To appear in IEEE International Symposium on High-Performance Computer Architecture (HPCA 2021)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2010.05037">arXiv:2010.05037</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2010.05037">pdf</a>, <a href="https://arxiv.org/format/2010.05037">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> Cross-Stack Workload Characterization of Deep Recommendation Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hsia%2C+S">Samuel Hsia</a>, <a href="/search/cs?searchtype=author&amp;query=Gupta%2C+U">Udit Gupta</a>, <a href="/search/cs?searchtype=author&amp;query=Wilkening%2C+M">Mark Wilkening</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+C">Carole-Jean Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+G">Gu-Yeon Wei</a>, <a 
href="/search/cs?searchtype=author&amp;query=Brooks%2C+D">David Brooks</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2010.05037v1-abstract-short" style="display: inline;"> Deep learning based recommendation systems form the backbone of most personalized cloud services. Though the computer architecture community has recently started to take notice of deep recommendation inference, the resulting solutions have taken wildly different approaches - ranging from near memory processing to at-scale optimizations. To better design future hardware systems for deep recommendat&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2010.05037v1-abstract-full').style.display = 'inline'; document.getElementById('2010.05037v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2010.05037v1-abstract-full" style="display: none;"> Deep learning based recommendation systems form the backbone of most personalized cloud services. Though the computer architecture community has recently started to take notice of deep recommendation inference, the resulting solutions have taken wildly different approaches - ranging from near memory processing to at-scale optimizations. To better design future hardware systems for deep recommendation inference, we must first systematically examine and characterize the underlying systems-level impact of design decisions across the different levels of the execution stack. In this paper, we characterize eight industry-representative deep recommendation models at three different levels of the execution stack: algorithms and software, systems platforms, and hardware microarchitectures. 
Through this cross-stack characterization, we first show that system deployment choices (i.e., CPUs or GPUs, batch size granularity) can give us up to 15x speedup. To better understand the bottlenecks for further optimization, we look at both software operator usage breakdown and CPU frontend and backend microarchitectural inefficiencies. Finally, we model the correlation between key algorithmic model architecture features and hardware bottlenecks, revealing the absence of a single dominant algorithmic component behind each hardware bottleneck. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2010.05037v1-abstract-full').style.display = 'none'; document.getElementById('2010.05037v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 October, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2020. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published in 2020 IEEE International Symposium on Workload Characterization (IISWC)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2009.12856">arXiv:2009.12856</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2009.12856">pdf</a>, <a href="https://arxiv.org/format/2009.12856">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Earth and Planetary Astrophysics">astro-ph.EP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Instrumentation and Methods for Astrophysics">astro-ph.IM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1088/1538-3873/abcaea">10.1088/1538-3873/abcaea <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Machine Learning for Searching the Dark Energy Survey for Trans-Neptunian Objects </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Henghes%2C+B">B. Henghes</a>, <a href="/search/cs?searchtype=author&amp;query=Lahav%2C+O">O. Lahav</a>, <a href="/search/cs?searchtype=author&amp;query=Gerdes%2C+D+W">D. W. Gerdes</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+E">E. Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Morgan%2C+R">R. Morgan</a>, <a href="/search/cs?searchtype=author&amp;query=Abbott%2C+T+M+C">T. M. C. 
Abbott</a>, <a href="/search/cs?searchtype=author&amp;query=Aguena%2C+M">M. Aguena</a>, <a href="/search/cs?searchtype=author&amp;query=Allam%2C+S">S. Allam</a>, <a href="/search/cs?searchtype=author&amp;query=Annis%2C+J">J. Annis</a>, <a href="/search/cs?searchtype=author&amp;query=Avila%2C+S">S. Avila</a>, <a href="/search/cs?searchtype=author&amp;query=Bertin%2C+E">E. Bertin</a>, <a href="/search/cs?searchtype=author&amp;query=Brooks%2C+D">D. Brooks</a>, <a href="/search/cs?searchtype=author&amp;query=Burke%2C+D+L">D. L. Burke</a>, <a href="/search/cs?searchtype=author&amp;query=CarneroRosell%2C+A">A. Carnero Rosell</a>, <a href="/search/cs?searchtype=author&amp;query=CarrascoKind%2C+M">M. Carrasco Kind</a>, <a href="/search/cs?searchtype=author&amp;query=Carretero%2C+J">J. Carretero</a>, <a href="/search/cs?searchtype=author&amp;query=Conselice%2C+C">C. Conselice</a>, <a href="/search/cs?searchtype=author&amp;query=Costanzi%2C+M">M. Costanzi</a>, <a href="/search/cs?searchtype=author&amp;query=da+Costa%2C+L+N">L. N. da Costa</a>, <a href="/search/cs?searchtype=author&amp;query=DeVicente%2C+J">J. DeVicente</a>, <a href="/search/cs?searchtype=author&amp;query=Desai%2C+S">S. Desai</a>, <a href="/search/cs?searchtype=author&amp;query=Diehl%2C+H+T">H. T. Diehl</a>, <a href="/search/cs?searchtype=author&amp;query=Doel%2C+P">P. Doel</a>, <a href="/search/cs?searchtype=author&amp;query=Everett%2C+S">S. Everett</a>, <a href="/search/cs?searchtype=author&amp;query=Ferrero%2C+I">I. Ferrero</a> , et al. (34 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2009.12856v2-abstract-short" style="display: inline;"> In this paper we investigate how implementing machine learning could improve the efficiency of the search for Trans-Neptunian Objects (TNOs) within Dark Energy Survey (DES) data when used alongside orbit fitting. 
The discovery of multiple TNOs that appear to show a similarity in their orbital parameters has led to the suggestion that one or more undetected planets, an as yet undiscovered &#34;Planet 9&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2009.12856v2-abstract-full').style.display = 'inline'; document.getElementById('2009.12856v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2009.12856v2-abstract-full" style="display: none;"> In this paper we investigate how implementing machine learning could improve the efficiency of the search for Trans-Neptunian Objects (TNOs) within Dark Energy Survey (DES) data when used alongside orbit fitting. The discovery of multiple TNOs that appear to show a similarity in their orbital parameters has led to the suggestion that one or more undetected planets, an as yet undiscovered &#34;Planet 9&#34;, may be present in the outer Solar System. DES is well placed to detect such a planet and has already been used to discover many other TNOs. Here, we perform tests on eight different supervised machine learning algorithms, using a dataset consisting of simulated TNOs buried within real DES noise data. We found that the best performing classifier was the Random Forest which, when optimised, performed well at detecting the rare objects. We achieve an area under the receiver operating characteristic (ROC) curve, (AUC) $= 0.996 \pm 0.001$. After optimizing the decision threshold of the Random Forest, we achieve a recall of 0.96 while maintaining a precision of 0.80. Finally, by using the optimized classifier to pre-select objects, we are able to run the orbit-fitting stage of our detection pipeline five times faster. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2009.12856v2-abstract-full').style.display = 'none'; document.getElementById('2009.12856v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 December, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 September, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published in PASP, 16 pages, 6 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> PASP 133 014501 (2021) </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2009.00655">arXiv:2009.00655</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2009.00655">pdf</a>, <a href="https://arxiv.org/format/2009.00655">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> AI solutions for drafting in Magic: the Gathering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ward%2C+H+N">Henry N. Ward</a>, <a href="/search/cs?searchtype=author&amp;query=Brooks%2C+D+J">Daniel J. Brooks</a>, <a href="/search/cs?searchtype=author&amp;query=Troha%2C+D">Dan Troha</a>, <a href="/search/cs?searchtype=author&amp;query=Mills%2C+B">Bobby Mills</a>, <a href="/search/cs?searchtype=author&amp;query=Khakhalin%2C+A+S">Arseny S. 
Khakhalin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2009.00655v3-abstract-short" style="display: inline;"> Drafting in Magic the Gathering is a sub-game within a larger trading card game, where several players progressively build decks by picking cards from a common pool. Drafting poses an interesting problem for game and AI research due to its large search space, mechanical complexity, multiplayer nature, and hidden information. Despite this, drafting remains understudied, in part due to a lack of hig&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2009.00655v3-abstract-full').style.display = 'inline'; document.getElementById('2009.00655v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2009.00655v3-abstract-full" style="display: none;"> Drafting in Magic the Gathering is a sub-game within a larger trading card game, where several players progressively build decks by picking cards from a common pool. Drafting poses an interesting problem for game and AI research due to its large search space, mechanical complexity, multiplayer nature, and hidden information. Despite this, drafting remains understudied, in part due to a lack of high-quality, public datasets. To rectify this problem, we present a dataset of over 100,000 simulated, anonymized human drafts collected from Draftsim.com. We also propose four diverse strategies for drafting agents, including a primitive heuristic agent, an expert-tuned complex heuristic agent, a Naive Bayes agent, and a deep neural network agent. We benchmark their ability to emulate human drafting, and show that the deep neural network agent outperforms other agents, while the Naive Bayes and expert-tuned agents outperform simple heuristics. 
We analyze the accuracy of AI agents across the timeline of a draft, and describe unique strengths and weaknesses for each approach. This work helps to identify next steps in the creation of humanlike drafting agents, and can serve as a benchmark for the next generation of drafting bots. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2009.00655v3-abstract-full').style.display = 'none'; document.getElementById('2009.00655v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 April, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 September, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2020. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2008.00119">arXiv:2008.00119</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2008.00119">pdf</a>, <a href="https://arxiv.org/format/2008.00119">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> CorrSigNet: Learning CORRelated Prostate Cancer SIGnatures from Radiology and Pathology Images for Improved Computer Aided Diagnosis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Bhattacharya%2C+I">Indrani Bhattacharya</a>, <a href="/search/cs?searchtype=author&amp;query=Seetharaman%2C+A">Arun Seetharaman</a>, <a href="/search/cs?searchtype=author&amp;query=Shao%2C+W">Wei Shao</a>, <a href="/search/cs?searchtype=author&amp;query=Sood%2C+R">Rewa 
Sood</a>, <a href="/search/cs?searchtype=author&amp;query=Kunder%2C+C+A">Christian A. Kunder</a>, <a href="/search/cs?searchtype=author&amp;query=Fan%2C+R+E">Richard E. Fan</a>, <a href="/search/cs?searchtype=author&amp;query=Soerensen%2C+S+J+C">Simon John Christoph Soerensen</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J+B">Jeffrey B. Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Ghanouni%2C+P">Pejman Ghanouni</a>, <a href="/search/cs?searchtype=author&amp;query=Teslovich%2C+N+C">Nikola C. Teslovich</a>, <a href="/search/cs?searchtype=author&amp;query=Brooks%2C+J+D">James D. Brooks</a>, <a href="/search/cs?searchtype=author&amp;query=Sonn%2C+G+A">Geoffrey A. Sonn</a>, <a href="/search/cs?searchtype=author&amp;query=Rusu%2C+M">Mirabela Rusu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2008.00119v1-abstract-short" style="display: inline;"> Magnetic Resonance Imaging (MRI) is widely used for screening and staging prostate cancer. However, many prostate cancers have subtle features which are not easily identifiable on MRI, resulting in missed diagnoses and alarming variability in radiologist interpretation. Machine learning models have been developed in an effort to improve cancer identification, but current models localize cancer usi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2008.00119v1-abstract-full').style.display = 'inline'; document.getElementById('2008.00119v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2008.00119v1-abstract-full" style="display: none;"> Magnetic Resonance Imaging (MRI) is widely used for screening and staging prostate cancer. 
However, many prostate cancers have subtle features which are not easily identifiable on MRI, resulting in missed diagnoses and alarming variability in radiologist interpretation. Machine learning models have been developed in an effort to improve cancer identification, but current models localize cancer using MRI-derived features, while failing to consider the disease pathology characteristics observed on resected tissue. In this paper, we propose CorrSigNet, an automated two-step model that localizes prostate cancer on MRI by capturing the pathology features of cancer. First, the model learns MRI signatures of cancer that are correlated with corresponding histopathology features using Common Representation Learning. Second, the model uses the learned correlated MRI features to train a Convolutional Neural Network to localize prostate cancer. The histopathology images are used only in the first step to learn the correlated features. Once learned, these correlated features can be extracted from MRI of new patients (without histopathology or surgery) to localize cancer. We trained and validated our framework on a unique dataset of 75 patients with 806 slices who underwent MRI followed by prostatectomy surgery. We tested our method on an independent test set of 20 prostatectomy patients (139 slices, 24 cancerous lesions, 1.12M pixels) and achieved a per-pixel sensitivity of 0.81, specificity of 0.71, AUC of 0.86 and a per-lesion AUC of $0.96 \pm 0.07$, outperforming the current state-of-the-art accuracy in predicting prostate cancer using MRI. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2008.00119v1-abstract-full').style.display = 'none'; document.getElementById('2008.00119v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 July, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to MICCAI 2020</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2006.00505">arXiv:2006.00505</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2006.00505">pdf</a>, <a href="https://arxiv.org/format/2006.00505">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Cheetah: Optimizing and Accelerating Homomorphic Encryption for Private Inference </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Reagen%2C+B">Brandon Reagen</a>, <a href="/search/cs?searchtype=author&amp;query=Choi%2C+W">Wooseok Choi</a>, <a href="/search/cs?searchtype=author&amp;query=Ko%2C+Y">Yeongil Ko</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+V">Vincent Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+G">Gu-Yeon Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+H+S">Hsien-Hsin S. 
Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Brooks%2C+D">David Brooks</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2006.00505v2-abstract-short" style="display: inline;"> As the application of deep learning continues to grow, so does the amount of data used to make predictions. While traditionally, big-data deep learning was constrained by computing performance and off-chip memory bandwidth, a new constraint has emerged: privacy. One solution is homomorphic encryption (HE). Applying HE to the client-cloud model allows cloud services to perform inference directly on&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2006.00505v2-abstract-full').style.display = 'inline'; document.getElementById('2006.00505v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2006.00505v2-abstract-full" style="display: none;"> As the application of deep learning continues to grow, so does the amount of data used to make predictions. While traditionally, big-data deep learning was constrained by computing performance and off-chip memory bandwidth, a new constraint has emerged: privacy. One solution is homomorphic encryption (HE). Applying HE to the client-cloud model allows cloud services to perform inference directly on the client&#39;s encrypted data. While HE can meet privacy constraints, it introduces enormous computational challenges and remains impractically slow in current systems. This paper introduces Cheetah, a set of algorithmic and hardware optimizations for HE DNN inference to achieve plaintext DNN inference speeds. Cheetah proposes HE-parameter tuning optimization and operator scheduling optimizations, which together deliver 79x speedup over the state-of-the-art. 
However, this still falls short of plaintext inference speeds by almost four orders of magnitude. To bridge the remaining performance gap, Cheetah further proposes an accelerator architecture that, when combined with the algorithmic optimizations, approaches plaintext DNN inference speeds. We evaluate several common neural network models (e.g., ResNet50, VGG16, and AlexNet) and show that plaintext-level HE inference for each is feasible with a custom accelerator consuming 30W and 545mm^2. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2006.00505v2-abstract-full').style.display = 'none'; document.getElementById('2006.00505v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 October, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 31 May, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2020. 
</p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Brooks%2C+D&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Brooks%2C+D&amp;start=0" class="pagination-link is-current" aria-label="Go to page 1" aria-current="page">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Brooks%2C+D&amp;start=50" class="pagination-link " aria-label="Page 2">2 </a> </li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> 
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 
9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

Pages: 1 2 3 4 5 6 7 8 9 10