CINXE.COM
Search | arXiv e-print repository
<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/> <!-- new favicon config and versions by realfavicongenerator.net --> <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b"> <!-- end favicon config --> <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\\(","\\)"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a> <!-- contains Cornell logo and sponsor statement --> <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–31 of 31 results for author: <span class="mathjax">Ogras, U</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Ogras%2C+U">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Ogras, U"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Ogras%2C+U&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Ogras, U"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.09188">arXiv:2410.09188</a> <span> [<a href="https://arxiv.org/pdf/2410.09188">pdf</a>, <a href="https://arxiv.org/format/2410.09188">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> </div> </div> <p class="title is-5 mathjax"> MFIT: Multi-Fidelity Thermal Modeling for 2.5D and 3D Multi-Chiplet Architectures </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Pfromm%2C+L">Lukas Pfromm</a>, <a href="/search/cs?searchtype=author&query=Kanani%2C+A">Alish Kanani</a>, <a href="/search/cs?searchtype=author&query=Sharma%2C+H">Harsh Sharma</a>, <a href="/search/cs?searchtype=author&query=Solanki%2C+P">Parth Solanki</a>, <a href="/search/cs?searchtype=author&query=Tervo%2C+E">Eric Tervo</a>, <a href="/search/cs?searchtype=author&query=Park%2C+J">Jaehyun Park</a>, <a href="/search/cs?searchtype=author&query=Doppa%2C+J+R">Janardhan Rao Doppa</a>, <a href="/search/cs?searchtype=author&query=Pande%2C+P+P">Partha Pratim Pande</a>, <a href="/search/cs?searchtype=author&query=Ogras%2C+U+Y">Umit Y. Ogras</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.09188v3-abstract-short" style="display: inline;"> Rapidly evolving artificial intelligence and machine learning applications require ever-increasing computational capabilities, while monolithic 2D design technologies approach their limits. Heterogeneous integration of smaller chiplets using a 2.5D silicon interposer and 3D packaging has emerged as a promising paradigm to address this limit and meet performance demands. These approaches offer a si… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09188v3-abstract-full').style.display = 'inline'; document.getElementById('2410.09188v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.09188v3-abstract-full" style="display: none;"> Rapidly evolving artificial intelligence and machine learning applications require ever-increasing computational capabilities, while monolithic 2D design technologies approach their limits. Heterogeneous integration of smaller chiplets using a 2.5D silicon interposer and 3D packaging has emerged as a promising paradigm to address this limit and meet performance demands. These approaches offer a significant cost reduction and higher manufacturing yield than monolithic 2D integrated circuits. However, the compact arrangement and high compute density exacerbate the thermal management challenges, potentially compromising performance. Addressing these thermal modeling challenges is critical, especially as system sizes grow and different design stages require varying levels of accuracy and speed. Since no single thermal modeling technique meets all these needs, this paper introduces MFIT, a range of multi-fidelity thermal models that effectively balance accuracy and speed. These multi-fidelity models can enable efficient design space exploration and runtime thermal management. Our extensive testing on systems with 16, 36, and 64 2.5D integrated chiplets and 16x3 3D integrated chiplets demonstrates that these models can reduce execution times from days to mere seconds and milliseconds with negligible loss in accuracy. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09188v3-abstract-full').style.display = 'none'; document.getElementById('2410.09188v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 11 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Preprint for MFIT: Multi-Fidelity Thermal Modeling for 2.5D and 3D Multi-Chiplet Architectures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.19073">arXiv:2403.19073</a> <span> [<a href="https://arxiv.org/pdf/2403.19073">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Emerging Technologies">cs.ET</span> </div> </div> <p class="title is-5 mathjax"> Dataflow-Aware PIM-Enabled Manycore Architecture for Deep Learning Workloads </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sharma%2C+H">Harsh Sharma</a>, <a href="/search/cs?searchtype=author&query=Narang%2C+G">Gaurav Narang</a>, <a href="/search/cs?searchtype=author&query=Doppa%2C+J+R">Janardhan Rao Doppa</a>, <a href="/search/cs?searchtype=author&query=Ogras%2C+U">Umit Ogras</a>, <a href="/search/cs?searchtype=author&query=Pande%2C+P+P">Partha Pratim Pande</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.19073v1-abstract-short" style="display: inline;"> Processing-in-memory (PIM) has emerged as an enabler for the energy-efficient and high-performance acceleration of deep learning (DL) workloads. Resistive random-access memory (ReRAM) is one of the most promising technologies to implement PIM. However, as the complexity of Deep convolutional neural networks (DNNs) grows, we need to design a manycore architecture with multiple ReRAM-based processin… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.19073v1-abstract-full').style.display = 'inline'; document.getElementById('2403.19073v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.19073v1-abstract-full" style="display: none;"> Processing-in-memory (PIM) has emerged as an enabler for the energy-efficient and high-performance acceleration of deep learning (DL) workloads. Resistive random-access memory (ReRAM) is one of the most promising technologies to implement PIM. However, as the complexity of Deep convolutional neural networks (DNNs) grows, we need to design a manycore architecture with multiple ReRAM-based processing elements (PEs) on a single chip. Existing PIM-based architectures mostly focus on computation while ignoring the role of communication. ReRAM-based tiled manycore architectures often involve many Processing Elements (PEs), which need to be interconnected via an efficient on-chip communication infrastructure. Simply allocating more resources (ReRAMs) to speed up only computation is ineffective if the communication infrastructure cannot keep up with it. In this paper, we highlight the design principles of a dataflow-aware PIM-enabled manycore platform tailor-made for various types of DL workloads. We consider the design challenges with both 2.5D interposer- and 3D integration-enabled architectures. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.19073v1-abstract-full').style.display = 'none'; document.getElementById('2403.19073v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Presented at DATE Conference, Valencia, Spain 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.11750">arXiv:2312.11750</a> <span> [<a href="https://arxiv.org/pdf/2312.11750">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> A Heterogeneous Chiplet Architecture for Accelerating End-to-End Transformer Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sharma%2C+H">Harsh Sharma</a>, <a href="/search/cs?searchtype=author&query=Dhingra%2C+P">Pratyush Dhingra</a>, <a href="/search/cs?searchtype=author&query=Doppa%2C+J+R">Janardhan Rao Doppa</a>, <a href="/search/cs?searchtype=author&query=Ogras%2C+U">Umit Ogras</a>, <a href="/search/cs?searchtype=author&query=Pande%2C+P+P">Partha Pratim Pande</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.11750v2-abstract-short" style="display: inline;"> Transformers have revolutionized deep learning and generative modeling, enabling advancements in natural language processing tasks. However, the size of transformer models is increasing continuously, driven by enhanced capabilities across various deep learning tasks. This trend of ever-increasing model size has given rise to new challenges in terms of memory and compute requirements. Conventional… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.11750v2-abstract-full').style.display = 'inline'; document.getElementById('2312.11750v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.11750v2-abstract-full" style="display: none;"> Transformers have revolutionized deep learning and generative modeling, enabling advancements in natural language processing tasks. However, the size of transformer models is increasing continuously, driven by enhanced capabilities across various deep learning tasks. This trend of ever-increasing model size has given rise to new challenges in terms of memory and compute requirements. Conventional computing platforms, including GPUs, suffer from suboptimal performance due to the memory demands imposed by models with millions/billions of parameters. The emerging chiplet-based platforms provide a new avenue for compute- and data-intensive machine learning (ML) applications enabled by a Network-on-Interposer (NoI). However, designing suitable hardware accelerators for executing Transformer inference workloads is challenging due to a wide variety of complex computing kernels in the Transformer architecture. In this paper, we leverage chiplet-based heterogeneous integration (HI) to design a high-performance and energy-efficient multi-chiplet platform to accelerate transformer workloads. We demonstrate that the proposed NoI architecture caters to the data access patterns inherent in a transformer model. The optimized placement of the chiplets and the associated NoI links and routers enable superior performance compared to the state-of-the-art hardware accelerators. The proposed NoI-based architecture demonstrates scalability across varying transformer models and improves latency and energy efficiency by up to 11.8x and 2.36x, respectively when compared with the existing state-of-the-art architecture HAIMA. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.11750v2-abstract-full').style.display = 'none'; document.getElementById('2312.11750v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To appear in ACM Transactions on Design Automation of Electronic Systems, 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2303.13071">arXiv:2303.13071</a> <span> [<a href="https://arxiv.org/pdf/2303.13071">pdf</a>, <a href="https://arxiv.org/format/2303.13071">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> PanoHead: Geometry-Aware 3D Full-Head Synthesis in 360$^{\circ}$ </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=An%2C+S">Sizhe An</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+H">Hongyi Xu</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yichun Shi</a>, <a href="/search/cs?searchtype=author&query=Song%2C+G">Guoxian Song</a>, <a href="/search/cs?searchtype=author&query=Ogras%2C+U">Umit Ogras</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+L">Linjie Luo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2303.13071v1-abstract-short" style="display: inline;"> Synthesis and reconstruction of 3D human head has gained increasing interests in computer vision and computer graphics recently. Existing state-of-the-art 3D generative adversarial networks (GANs) for 3D human head synthesis are either limited to near-frontal views or hard to preserve 3D consistency in large view angles. We propose PanoHead, the first 3D-aware generative model that enables high-qu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.13071v1-abstract-full').style.display = 'inline'; document.getElementById('2303.13071v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2303.13071v1-abstract-full" style="display: none;"> Synthesis and reconstruction of 3D human head has gained increasing interests in computer vision and computer graphics recently. Existing state-of-the-art 3D generative adversarial networks (GANs) for 3D human head synthesis are either limited to near-frontal views or hard to preserve 3D consistency in large view angles. We propose PanoHead, the first 3D-aware generative model that enables high-quality view-consistent image synthesis of full heads in $360^\circ$ with diverse appearance and detailed geometry using only in-the-wild unstructured images for training. At its core, we lift up the representation power of recent 3D GANs and bridge the data alignment gap when training from in-the-wild images with widely distributed views. Specifically, we propose a novel two-stage self-adaptive image alignment for robust 3D GAN training. We further introduce a tri-grid neural volume representation that effectively addresses front-face and back-head feature entanglement rooted in the widely-adopted tri-plane formulation. Our method instills prior knowledge of 2D image segmentation in adversarial learning of 3D neural scene structures, enabling compositable head synthesis in diverse backgrounds. Benefiting from these designs, our method significantly outperforms previous 3D GANs, generating high-quality 3D heads with accurate geometry and diverse appearances, even with long wavy and afro hairstyles, renderable from arbitrary poses. Furthermore, we show that our system can reconstruct full 3D heads from single input images for personalized realistic 3D avatars. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.13071v1-abstract-full').style.display = 'none'; document.getElementById('2303.13071v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 March, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">CVPR 2023. Project Page:https://sizhean.github.io/panohead</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2302.12779">arXiv:2302.12779</a> <span> [<a href="https://arxiv.org/pdf/2302.12779">pdf</a>, <a href="https://arxiv.org/format/2302.12779">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> </div> </div> <p class="title is-5 mathjax"> Machine Learning-based Low Overhead Congestion Control Algorithm for Industrial NoCs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Narayana%2C+S+Y">Shruti Yadav Narayana</a>, <a href="/search/cs?searchtype=author&query=Mandal%2C+S+K">Sumit K. Mandal</a>, <a href="/search/cs?searchtype=author&query=Ayoub%2C+R">Raid Ayoub</a>, <a href="/search/cs?searchtype=author&query=Kishinevsky%2C+M">Michael Kishinevsky</a>, <a href="/search/cs?searchtype=author&query=Ogras%2C+U+Y">Umit Y. Ogras</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2302.12779v1-abstract-short" style="display: inline;"> Network-on-Chip (NoC) congestion builds up during heavy traffic load and cripples the system performance by stalling the cores. Moreover, congestion leads to wasted link bandwidth due to blocked buffers and bouncing packets. Existing approaches throttle the cores after congestion is detected, reducing efficiency and wasting line bandwidth unnecessarily. In contrast, we propose a lightweight machin… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2302.12779v1-abstract-full').style.display = 'inline'; document.getElementById('2302.12779v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2302.12779v1-abstract-full" style="display: none;"> Network-on-Chip (NoC) congestion builds up during heavy traffic load and cripples the system performance by stalling the cores. Moreover, congestion leads to wasted link bandwidth due to blocked buffers and bouncing packets. Existing approaches throttle the cores after congestion is detected, reducing efficiency and wasting line bandwidth unnecessarily. In contrast, we propose a lightweight machine learning-based technique that helps predict congestion in the network. Specifically, our proposed technique collects the features related to traffic at each destination. Then, it labels the features using a novel time reversal approach. The labeled data is used to design a low overhead and an explainable decision tree model used at runtime congestion control. Experimental evaluations with synthetic and real traffic on industrial 6$\times$6 NoC show that the proposed approach increases fairness and memory read bandwidth by up to 114\% with respect to existing congestion control technique while incurring less than 0.01\% of overhead. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2302.12779v1-abstract-full').style.display = 'none'; document.getElementById('2302.12779v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 February, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The short version of the paper has been accepted in DATE'23</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2210.08394">arXiv:2210.08394</a> <span> [<a href="https://arxiv.org/pdf/2210.08394">pdf</a>, <a href="https://arxiv.org/format/2210.08394">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> mRI: Multi-modal 3D Human Pose Estimation Dataset using mmWave, RGB-D, and Inertial Sensors </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=An%2C+S">Sizhe An</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yin Li</a>, <a href="/search/cs?searchtype=author&query=Ogras%2C+U">Umit Ogras</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2210.08394v1-abstract-short" style="display: inline;"> The ability to estimate 3D human body pose and movement, also known as human pose estimation (HPE), enables many applications for home-based health monitoring, such as remote rehabilitation training. Several possible solutions have emerged using sensors ranging from RGB cameras, depth sensors, millimeter-Wave (mmWave) radars, and wearable inertial sensors. Despite previous efforts on datasets and… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.08394v1-abstract-full').style.display = 'inline'; document.getElementById('2210.08394v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2210.08394v1-abstract-full" style="display: none;"> The ability to estimate 3D human body pose and movement, also known as human pose estimation (HPE), enables many applications for home-based health monitoring, such as remote rehabilitation training. Several possible solutions have emerged using sensors ranging from RGB cameras, depth sensors, millimeter-Wave (mmWave) radars, and wearable inertial sensors. Despite previous efforts on datasets and benchmarks for HPE, few dataset exploits multiple modalities and focuses on home-based health monitoring. To bridge the gap, we present mRI, a multi-modal 3D human pose estimation dataset with mmWave, RGB-D, and Inertial Sensors. Our dataset consists of over 160k synchronized frames from 20 subjects performing rehabilitation exercises and supports the benchmarks of HPE and action detection. We perform extensive experiments using our dataset and delineate the strength of each modality. We hope that the release of mRI can catalyze the research in pose estimation, multi-modal learning, and action understanding, and more importantly facilitate the applications of home-based health monitoring. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.08394v1-abstract-full').style.display = 'none'; document.getElementById('2210.08394v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Thirty-sixth Conference on Neural Information Processing Systems (NeurIPS 2022). Project page: https://sizhean.github.io/mri</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2208.07914">arXiv:2208.07914</a> <span> [<a href="https://arxiv.org/pdf/2208.07914">pdf</a>, <a href="https://arxiv.org/format/2208.07914">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> PD-MORL: Preference-Driven Multi-Objective Reinforcement Learning Algorithm </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Basaklar%2C+T">Toygun Basaklar</a>, <a href="/search/cs?searchtype=author&query=Gumussoy%2C+S">Suat Gumussoy</a>, <a href="/search/cs?searchtype=author&query=Ogras%2C+U+Y">Umit Y. Ogras</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2208.07914v3-abstract-short" style="display: inline;"> Multi-objective reinforcement learning (MORL) approaches have emerged to tackle many real-world problems with multiple conflicting objectives by maximizing a joint objective function weighted by a preference vector. These approaches find fixed customized policies corresponding to preference vectors specified during training. However, the design constraints and objectives typically change dynamical… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2208.07914v3-abstract-full').style.display = 'inline'; document.getElementById('2208.07914v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2208.07914v3-abstract-full" style="display: none;"> Multi-objective reinforcement learning (MORL) approaches have emerged to tackle many real-world problems with multiple conflicting objectives by maximizing a joint objective function weighted by a preference vector. These approaches find fixed customized policies corresponding to preference vectors specified during training. However, the design constraints and objectives typically change dynamically in real-life scenarios. Furthermore, storing a policy for each potential preference is not scalable. Hence, obtaining a set of Pareto front solutions for the entire preference space in a given domain with a single training is critical. To this end, we propose a novel MORL algorithm that trains a single universal network to cover the entire preference space scalable to continuous robotic tasks. The proposed approach, Preference-Driven MORL (PD-MORL), utilizes the preferences as guidance to update the network parameters. It also employs a novel parallelization approach to increase sample efficiency. We show that PD-MORL achieves up to 25% larger hypervolume for challenging continuous control tasks and uses an order of magnitude fewer trainable parameters compared to prior approaches. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2208.07914v3-abstract-full').style.display = 'none'; document.getElementById('2208.07914v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 August, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">24 pages, 8 Figures, 9 Tables, Published as a conference paper at ICLR 2023, https://openreview.net/forum?id=zS9sRyaPFlJ</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2205.07311">arXiv:2205.07311</a> <span> [<a href="https://arxiv.org/pdf/2205.07311">pdf</a>, <a href="https://arxiv.org/format/2205.07311">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/JETCAS.2022.3169899">10.1109/JETCAS.2022.3169899 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> COIN: Communication-Aware In-Memory Acceleration for Graph Convolutional Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mandal%2C+S+K">Sumit K. Mandal</a>, <a href="/search/cs?searchtype=author&query=Krishnan%2C+G">Gokul Krishnan</a>, <a href="/search/cs?searchtype=author&query=Goksoy%2C+A+A">A. Alper Goksoy</a>, <a href="/search/cs?searchtype=author&query=Nair%2C+G+R">Gopikrishnan Ravindran Nair</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+Y">Yu Cao</a>, <a href="/search/cs?searchtype=author&query=Ogras%2C+U+Y">Umit Y. Ogras</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2205.07311v1-abstract-short" style="display: inline;"> Graph convolutional networks (GCNs) have shown remarkable learning capabilities when processing graph-structured data found inherently in many application areas. GCNs distribute the outputs of neural networks embedded in each vertex over multiple iterations to take advantage of the relations captured by the underlying graphs. Consequently, they incur a significant amount of computation and irregul… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2205.07311v1-abstract-full').style.display = 'inline'; document.getElementById('2205.07311v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2205.07311v1-abstract-full" style="display: none;"> Graph convolutional networks (GCNs) have shown remarkable learning capabilities when processing graph-structured data found inherently in many application areas. GCNs distribute the outputs of neural networks embedded in each vertex over multiple iterations to take advantage of the relations captured by the underlying graphs. Consequently, they incur a significant amount of computation and irregular communication overheads, which call for GCN-specific hardware accelerators. To this end, this paper presents a communication-aware in-memory computing architecture (COIN) for GCN hardware acceleration. Besides accelerating the computation using custom compute elements (CE) and in-memory computing, COIN aims at minimizing the intra- and inter-CE communication in GCN operations to optimize the performance and energy efficiency. Experimental evaluations with widely used datasets show up to 105x improvement in energy consumption compared to state-of-the-art GCN accelerator. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2205.07311v1-abstract-full').style.display = 'none'; document.getElementById('2205.07311v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 May, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">IEEE Journal on Emerging and Selected Topics in Circuits and Systems (2022)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2202.09297">arXiv:2202.09297</a> <span> [<a href="https://arxiv.org/pdf/2202.09297">pdf</a>, <a href="https://arxiv.org/format/2202.09297">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> tinyMAN: Lightweight Energy Manager using Reinforcement Learning for Energy Harvesting Wearable IoT Devices </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Basaklar%2C+T">Toygun Basaklar</a>, <a href="/search/cs?searchtype=author&query=Tuncel%2C+Y">Yigit Tuncel</a>, <a href="/search/cs?searchtype=author&query=Ogras%2C+U+Y">Umit Y. Ogras</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2202.09297v2-abstract-short" style="display: inline;"> Advances in low-power electronics and machine learning techniques lead to many novel wearable IoT devices. These devices have limited battery capacity and computational power. Thus, energy harvesting from ambient sources is a promising solution to power these low-energy wearable devices. They need to manage the harvested energy optimally to achieve energy-neutral operation, which eliminates rechar… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2202.09297v2-abstract-full').style.display = 'inline'; document.getElementById('2202.09297v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2202.09297v2-abstract-full" style="display: none;"> Advances in low-power electronics and machine learning techniques lead to many novel wearable IoT devices. These devices have limited battery capacity and computational power. Thus, energy harvesting from ambient sources is a promising solution to power these low-energy wearable devices. They need to manage the harvested energy optimally to achieve energy-neutral operation, which eliminates recharging requirements. Optimal energy management is a challenging task due to the dynamic nature of the harvested energy and the battery energy constraints of the target device. To address this challenge, we present a reinforcement learning-based energy management framework, tinyMAN, for resource-constrained wearable IoT devices. The framework maximizes the utilization of the target device under dynamic energy harvesting patterns and battery constraints. Moreover, tinyMAN does not rely on forecasts of the harvested energy which makes it a prediction-free approach. We deployed tinyMAN on a wearable device prototype using TensorFlow Lite for Micro thanks to its small memory footprint of less than 100 KB. Our evaluations show that tinyMAN achieves less than 2.36 ms and 27.75 $渭$J while maintaining up to 45% higher utility compared to prior approaches. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2202.09297v2-abstract-full').style.display = 'none'; document.getElementById('2202.09297v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 March, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 February, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7 pages, 4 figures, accepted as "Full Paper" for the 2022 tinyML Research Symposium</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2112.08980">arXiv:2112.08980</a> <span> [<a href="https://arxiv.org/pdf/2112.08980">pdf</a>, <a href="https://arxiv.org/format/2112.08980">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> Performant, Multi-objective Scheduling of Highly Interleaved Task Graphs on Heterogeneous System on Chip Devices </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mack%2C+J">Joshua Mack</a>, <a href="/search/cs?searchtype=author&query=Arda%2C+S+E">Samet E. Arda</a>, <a href="/search/cs?searchtype=author&query=Ogras%2C+U+Y">Umit Y. Ogras</a>, <a href="/search/cs?searchtype=author&query=Akoglu%2C+A">Ali Akoglu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2112.08980v1-abstract-short" style="display: inline;"> Performance-, power-, and energy-aware scheduling techniques play an essential role in optimally utilizing processing elements (PEs) of heterogeneous systems. List schedulers, a class of low-complexity static schedulers, have commonly been used in static execution scenarios. However, list schedulers are not suitable for runtime decision making, particularly when multiple concurrent applications ar… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2112.08980v1-abstract-full').style.display = 'inline'; document.getElementById('2112.08980v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2112.08980v1-abstract-full" style="display: none;"> Performance-, power-, and energy-aware scheduling techniques play an essential role in optimally utilizing processing elements (PEs) of heterogeneous systems. List schedulers, a class of low-complexity static schedulers, have commonly been used in static execution scenarios. However, list schedulers are not suitable for runtime decision making, particularly when multiple concurrent applications are interleaved dynamically. For such cases, the static task execution times and expectation of idle PEs assumed by list schedulers lead to inefficient system utilization and poor performance. To address this problem, we present techniques for optimizing execution of list scheduling algorithms in dynamic runtime scenarios via a family of algorithms inspired by the well-known heterogeneous earliest finish time (HEFT) list scheduler. Through dynamically arriving, realistic workload scenarios that are simulated in an open-source discrete event heterogeneous SoC simulator, we exhaustively evaluate each of the proposed algorithms across two SoCs modeled after the Xilinx Zynq Ultrascale+ ZCU102 and O-Droid XU3 development boards. Altogether, depending on the chosen variant in this family of algorithms, we are able to achieve an up to 39% execution time improvement, up to 7.24x algorithmic speedup, or up to 30% energy consumption improvement compared to the baseline HEFT implementation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2112.08980v1-abstract-full').style.display = 'none'; document.getElementById('2112.08980v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 December, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">15 pages, 2 pages of appendix, 14 figures including appendix. Accepted for publication in IEEE Transactions on Parallel and Distributed Systems</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2109.11069">arXiv:2109.11069</a> <span> [<a href="https://arxiv.org/pdf/2109.11069">pdf</a>, <a href="https://arxiv.org/format/2109.11069">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/LES.2021.3110426">10.1109/LES.2021.3110426 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> DAS: Dynamic Adaptive Scheduling for Energy-Efficient Heterogeneous SoCs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Goksoy%2C+A+A">A. Alper Goksoy</a>, <a href="/search/cs?searchtype=author&query=Krishnakumar%2C+A">Anish Krishnakumar</a>, <a href="/search/cs?searchtype=author&query=Hassan%2C+M+S">Md Sahil Hassan</a>, <a href="/search/cs?searchtype=author&query=Farcas%2C+A+J">Allen J. Farcas</a>, <a href="/search/cs?searchtype=author&query=Akoglu%2C+A">Ali Akoglu</a>, <a href="/search/cs?searchtype=author&query=Marculescu%2C+R">Radu Marculescu</a>, <a href="/search/cs?searchtype=author&query=Ogras%2C+U+Y">Umit Y. Ogras</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2109.11069v1-abstract-short" style="display: inline;"> Domain-specific systems-on-chip (DSSoCs) aim at bridging the gap between application-specific integrated circuits (ASICs) and general-purpose processors. Traditional operating system (OS) schedulers can undermine the potential of DSSoCs since their execution times can be orders of magnitude larger than the execution time of the task itself. To address this problem, we propose a dynamic adaptive sc… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2109.11069v1-abstract-full').style.display = 'inline'; document.getElementById('2109.11069v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2109.11069v1-abstract-full" style="display: none;"> Domain-specific systems-on-chip (DSSoCs) aim at bridging the gap between application-specific integrated circuits (ASICs) and general-purpose processors. Traditional operating system (OS) schedulers can undermine the potential of DSSoCs since their execution times can be orders of magnitude larger than the execution time of the task itself. To address this problem, we propose a dynamic adaptive scheduling (DAS) framework that combines the benefits of a fast (low-overhead) scheduler and a slow (sophisticated, high-performance but high-overhead) scheduler. Experiments with five real-world streaming applications show that DAS consistently outperforms both the fast and slow schedulers. For 40 different workloads, DAS achieves on average 1.29x speedup and 45% lower EDP compared to the sophisticated scheduler at low data rates and 1.28x speedup and 37% lower EDP than the fast scheduler when the workload complexity increases. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2109.11069v1-abstract-full').style.display = 'none'; document.getElementById('2109.11069v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 September, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">4 pages, 2 tables, 3 figures, 1 algorithm, Accepted for publication in IEEE Embedded Systems Letters</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2108.09534">arXiv:2108.09534</a> <span> [<a href="https://arxiv.org/pdf/2108.09534">pdf</a>, <a href="https://arxiv.org/format/2108.09534">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Performance">cs.PF</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Probability">math.PR</span> </div> </div> <p class="title is-5 mathjax"> Theoretical Analysis and Evaluation of NoCs with Weighted Round-Robin Arbitration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mandal%2C+S+K">Sumit K. Mandal</a>, <a href="/search/cs?searchtype=author&query=Tong%2C+J">Jie Tong</a>, <a href="/search/cs?searchtype=author&query=Ayoub%2C+R">Raid Ayoub</a>, <a href="/search/cs?searchtype=author&query=Kishinevsky%2C+M">Michael Kishinevsky</a>, <a href="/search/cs?searchtype=author&query=Abousamra%2C+A">Ahmed Abousamra</a>, <a href="/search/cs?searchtype=author&query=Ogras%2C+U+Y">Umit Y. Ogras</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2108.09534v2-abstract-short" style="display: inline;"> Fast and accurate performance analysis techniques are essential in early design space exploration and pre-silicon evaluations, including software eco-system development. In particular, on-chip communication continues to play an increasingly important role as the many-core processors scale up. This paper presents the first performance analysis technique that targets networks-on-chip (NoCs) that emp… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2108.09534v2-abstract-full').style.display = 'inline'; document.getElementById('2108.09534v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2108.09534v2-abstract-full" style="display: none;"> Fast and accurate performance analysis techniques are essential in early design space exploration and pre-silicon evaluations, including software eco-system development. In particular, on-chip communication continues to play an increasingly important role as the many-core processors scale up. This paper presents the first performance analysis technique that targets networks-on-chip (NoCs) that employ weighted round-robin (WRR) arbitration. Besides fairness, WRR arbitration provides flexibility in allocating bandwidth proportionally to the importance of the traffic classes, unlike basic round-robin and priority-based arbitration. The proposed approach first estimates the effective service time of the packets in the queue due to WRR arbitration. Then, it uses the effective service time to compute the average waiting time of the packets. Next, we incorporate a decomposition technique to extend the analytical model to handle NoC of any size. The proposed approach achieves less than 5% error while executing real applications and 10% error under challenging synthetic traffic with different burstiness levels. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2108.09534v2-abstract-full').style.display = 'none'; document.getElementById('2108.09534v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 August, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper is accepted in International Conference on Computer Aided Design (ICCAD), 2021</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2108.08903">arXiv:2108.08903</a> <span> [<a href="https://arxiv.org/pdf/2108.08903">pdf</a>, <a href="https://arxiv.org/format/2108.08903">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3476999">10.1145/3476999 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> SIAM: Chiplet-based Scalable In-Memory Acceleration with Mesh for Deep Neural Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Krishnan%2C+G">Gokul Krishnan</a>, <a href="/search/cs?searchtype=author&query=Mandal%2C+S+K">Sumit K. Mandal</a>, <a href="/search/cs?searchtype=author&query=Pannala%2C+M">Manvitha Pannala</a>, <a href="/search/cs?searchtype=author&query=Chakrabarti%2C+C">Chaitali Chakrabarti</a>, <a href="/search/cs?searchtype=author&query=Seo%2C+J">Jae-sun Seo</a>, <a href="/search/cs?searchtype=author&query=Ogras%2C+U+Y">Umit Y. Ogras</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+Y">Yu Cao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2108.08903v1-abstract-short" style="display: inline;"> In-memory computing (IMC) on a monolithic chip for deep learning faces dramatic challenges on area, yield, and on-chip interconnection cost due to the ever-increasing model sizes. 2.5D integration or chiplet-based architectures interconnect multiple small chips (i.e., chiplets) to form a large computing system, presenting a feasible solution beyond a monolithic IMC architecture to accelerate large… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2108.08903v1-abstract-full').style.display = 'inline'; document.getElementById('2108.08903v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2108.08903v1-abstract-full" style="display: none;"> In-memory computing (IMC) on a monolithic chip for deep learning faces dramatic challenges on area, yield, and on-chip interconnection cost due to the ever-increasing model sizes. 2.5D integration or chiplet-based architectures interconnect multiple small chips (i.e., chiplets) to form a large computing system, presenting a feasible solution beyond a monolithic IMC architecture to accelerate large deep learning models. This paper presents a new benchmarking simulator, SIAM, to evaluate the performance of chiplet-based IMC architectures and explore the potential of such a paradigm shift in IMC architecture design. SIAM integrates device, circuit, architecture, network-on-chip (NoC), network-on-package (NoP), and DRAM access models to realize an end-to-end system. SIAM is scalable in its support of a wide range of deep neural networks (DNNs), customizable to various network structures and configurations, and capable of efficient design space exploration. We demonstrate the flexibility, scalability, and simulation speed of SIAM by benchmarking different state-of-the-art DNNs with CIFAR-10, CIFAR-100, and ImageNet datasets. We further calibrate the simulation results with a published silicon result, SIMBA. The chiplet-based IMC architecture obtained through SIAM shows 130$\times$ and 72$\times$ improvement in energy-efficiency for ResNet-50 on the ImageNet dataset compared to Nvidia V100 and T4 GPUs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2108.08903v1-abstract-full').style.display = 'none'; document.getElementById('2108.08903v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 August, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2021. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2108.00568">arXiv:2108.00568</a> <span> [<a href="https://arxiv.org/pdf/2108.00568">pdf</a>, <a href="https://arxiv.org/format/2108.00568">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> FLASH: Fast Neural Architecture Search with Hardware Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+G">Guihong Li</a>, <a href="/search/cs?searchtype=author&query=Mandal%2C+S+K">Sumit K. Mandal</a>, <a href="/search/cs?searchtype=author&query=Ogras%2C+U+Y">Umit Y. Ogras</a>, <a href="/search/cs?searchtype=author&query=Marculescu%2C+R">Radu Marculescu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2108.00568v1-abstract-short" style="display: inline;"> Neural architecture search (NAS) is a promising technique to design efficient and high-performance deep neural networks (DNNs). As the performance requirements of ML applications grow continuously, the hardware accelerators start playing a central role in DNN design. This trend makes NAS even more complicated and time-consuming for most real applications. This paper proposes FLASH, a very fast NAS… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2108.00568v1-abstract-full').style.display = 'inline'; document.getElementById('2108.00568v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2108.00568v1-abstract-full" style="display: none;"> Neural architecture search (NAS) is a promising technique to design efficient and high-performance deep neural networks (DNNs). As the performance requirements of ML applications grow continuously, the hardware accelerators start playing a central role in DNN design. This trend makes NAS even more complicated and time-consuming for most real applications. This paper proposes FLASH, a very fast NAS methodology that co-optimizes the DNN accuracy and performance on a real hardware platform. As the main theoretical contribution, we first propose the NN-Degree, an analytical metric to quantify the topological characteristics of DNNs with skip connections (e.g., DenseNets, ResNets, Wide-ResNets, and MobileNets). The newly proposed NN-Degree allows us to do training-free NAS within one second and build an accuracy predictor by training as few as 25 samples out of a vast search space with more than 63 billion configurations. Second, by performing inference on the target hardware, we fine-tune and validate our analytical models to estimate the latency, area, and energy consumption of various DNN architectures while executing standard ML datasets. Third, we construct a hierarchical algorithm based on simplicial homology global optimization (SHGO) to optimize the model-architecture co-design process, while considering the area, latency, and energy consumption of the target hardware. We demonstrate that, compared to the state-of-the-art NAS approaches, our proposed hierarchical SHGO-based algorithm enables more than four orders of magnitude speedup (specifically, the execution time of the proposed algorithm is about 0.1 seconds). Finally, our experimental evaluations show that FLASH is easily transferable to different hardware architectures, thus enabling us to do NAS on a Raspberry Pi-3B processor in less than 3 seconds. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2108.00568v1-abstract-full').style.display = 'none'; document.getElementById('2108.00568v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 August, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published at ACM CODES+ISSS 2021</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2107.02358">arXiv:2107.02358</a> <span> [<a href="https://arxiv.org/pdf/2107.02358">pdf</a>, <a href="https://arxiv.org/format/2107.02358">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3460233">10.1145/3460233 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Impact of On-Chip Interconnect on In-Memory Acceleration of Deep Neural Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Krishnan%2C+G">Gokul Krishnan</a>, <a href="/search/cs?searchtype=author&query=Mandal%2C+S+K">Sumit K. Mandal</a>, <a href="/search/cs?searchtype=author&query=Chakrabarti%2C+C">Chaitali Chakrabarti</a>, <a href="/search/cs?searchtype=author&query=Seo%2C+J">Jae-sun Seo</a>, <a href="/search/cs?searchtype=author&query=Ogras%2C+U+Y">Umit Y. Ogras</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+Y">Yu Cao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2107.02358v1-abstract-short" style="display: inline;"> With the widespread use of Deep Neural Networks (DNNs), machine learning algorithms have evolved in two diverse directions -- one with ever-increasing connection density for better accuracy and the other with more compact sizing for energy efficiency. The increase in connection density increases on-chip data movement, which makes efficient on-chip communication a critical function of the DNN accel… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2107.02358v1-abstract-full').style.display = 'inline'; document.getElementById('2107.02358v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2107.02358v1-abstract-full" style="display: none;"> With the widespread use of Deep Neural Networks (DNNs), machine learning algorithms have evolved in two diverse directions -- one with ever-increasing connection density for better accuracy and the other with more compact sizing for energy efficiency. The increase in connection density increases on-chip data movement, which makes efficient on-chip communication a critical function of the DNN accelerator. The contribution of this work is threefold. First, we illustrate that the point-to-point (P2P)-based interconnect is incapable of handling a high volume of on-chip data movement for DNNs. Second, we evaluate P2P and network-on-chip (NoC) interconnect (with a regular topology such as a mesh) for SRAM- and ReRAM-based in-memory computing (IMC) architectures for a range of DNNs. This analysis shows the necessity for the optimal interconnect choice for an IMC DNN accelerator. Finally, we perform an experimental evaluation for different DNNs to empirically obtain the performance of the IMC architecture with both NoC-tree and NoC-mesh. We conclude that, at the tile level, NoC-tree is appropriate for compact DNNs employed at the edge, and NoC-mesh is necessary to accelerate DNNs with high connection density. Furthermore, we propose a technique to determine the optimal choice of interconnect for any given DNN. In this technique, we use analytical models of NoC to evaluate end-to-end communication latency of any given DNN. We demonstrate that the interconnect optimization in the IMC architecture results in up to 6$\times$ improvement in energy-delay-area product for VGG-19 inference compared to the state-of-the-art ReRAM-based IMC architectures. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2107.02358v1-abstract-full').style.display = 'none'; document.getElementById('2107.02358v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 July, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2021. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2103.06709">arXiv:2103.06709</a> <span> [<a href="https://arxiv.org/pdf/2103.06709">pdf</a>, <a href="https://arxiv.org/format/2103.06709">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> </div> </div> <p class="title is-5 mathjax"> Hypervector Design for Efficient Hyperdimensional Computing on Edge Devices </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Basaklar%2C+T">Toygun Basaklar</a>, <a href="/search/cs?searchtype=author&query=Tuncel%2C+Y">Yigit Tuncel</a>, <a href="/search/cs?searchtype=author&query=Narayana%2C+S+Y">Shruti Yadav Narayana</a>, <a href="/search/cs?searchtype=author&query=Gumussoy%2C+S">Suat Gumussoy</a>, <a href="/search/cs?searchtype=author&query=Ogras%2C+U+Y">Umit Y. Ogras</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2103.06709v1-abstract-short" style="display: inline;"> Hyperdimensional computing (HDC) has emerged as a new light-weight learning algorithm with smaller computation and energy requirements compared to conventional techniques. In HDC, data points are represented by high-dimensional vectors (hypervectors), which are mapped to high-dimensional space (hyperspace). Typically, a large hypervector dimension ($\geq1000$) is required to achieve accuracies com… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2103.06709v1-abstract-full').style.display = 'inline'; document.getElementById('2103.06709v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2103.06709v1-abstract-full" style="display: none;"> Hyperdimensional computing (HDC) has emerged as a new light-weight learning algorithm with smaller computation and energy requirements compared to conventional techniques. In HDC, data points are represented by high-dimensional vectors (hypervectors), which are mapped to high-dimensional space (hyperspace). Typically, a large hypervector dimension ($\geq1000$) is required to achieve accuracies comparable to conventional alternatives. However, unnecessarily large hypervectors increase hardware and energy costs, which can undermine their benefits. This paper presents a technique to minimize the hypervector dimension while maintaining the accuracy and improving the robustness of the classifier. To this end, we formulate the hypervector design as a multi-objective optimization problem for the first time in the literature. The proposed approach decreases the hypervector dimension by more than $32\times$ while maintaining or increasing the accuracy achieved by conventional HDC. Experiments on a commercial hardware platform show that the proposed approach achieves more than one order of magnitude reduction in model size, inference time, and energy consumption. We also demonstrate the trade-off between accuracy and robustness to noise and provide Pareto front solutions as a design parameter in our hypervector design. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2103.06709v1-abstract-full').style.display = 'none'; document.getElementById('2103.06709v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 March, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages, 6 figures, accepted to tinyML 2021 Research Symposium</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2102.13605">arXiv:2102.13605</a> <span> [<a href="https://arxiv.org/pdf/2102.13605">pdf</a>, <a href="https://arxiv.org/format/2102.13605">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> ECO: Enabling Energy-Neutral IoT Devices through Runtime Allocation of Harvested Energy </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tuncel%2C+Y">Yigit Tuncel</a>, <a href="/search/cs?searchtype=author&query=Bhat%2C+G">Ganapati Bhat</a>, <a href="/search/cs?searchtype=author&query=Park%2C+J">Jaehyun Park</a>, <a href="/search/cs?searchtype=author&query=Ogras%2C+U">Umit Ogras</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2102.13605v2-abstract-short" style="display: inline;"> Energy harvesting offers an attractive and promising mechanism to power low-energy devices. However, it alone is insufficient to enable an energy-neutral operation, which can eliminate tedious battery charging and replacement requirements. Achieving an energy-neutral operation is challenging since the uncertainties in harvested energy undermine the quality of service requirements. To address this… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2102.13605v2-abstract-full').style.display = 'inline'; document.getElementById('2102.13605v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2102.13605v2-abstract-full" style="display: none;"> Energy harvesting offers an attractive and promising mechanism to power low-energy devices. However, it alone is insufficient to enable an energy-neutral operation, which can eliminate tedious battery charging and replacement requirements. Achieving an energy-neutral operation is challenging since the uncertainties in harvested energy undermine the quality of service requirements. To address this challenge, we present a runtime energy-allocation framework that optimizes the utility of the target device under energy constraints using a rollout algorithm, which is a sequential approach to solve dynamic optimization problems. The proposed framework uses an efficient iterative algorithm to compute initial energy allocations at the beginning of a day. The initial allocations are then corrected at every interval to compensate for the deviations from the expected energy harvesting pattern. We evaluate this framework using solar and motion energy harvesting modalities and American Time Use Survey data from 4772 different users. Compared to prior techniques, the proposed framework achieves up to 35% higher utility even under energy-limited scenarios. Moreover, measurements on a wearable device prototype show that the proposed framework has 1000x smaller energy overhead than iterative approaches with a negligible loss in utility. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2102.13605v2-abstract-full').style.display = 'none'; document.getElementById('2102.13605v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 September, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 February, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2021. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2012.04479">arXiv:2012.04479</a> <span> [<a href="https://arxiv.org/pdf/2012.04479">pdf</a>, <a href="https://arxiv.org/format/2012.04479">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Transfer Learning for Human Activity Recognition using Representational Analysis of Neural Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=An%2C+S">Sizhe An</a>, <a href="/search/cs?searchtype=author&query=Bhat%2C+G">Ganapati Bhat</a>, <a href="/search/cs?searchtype=author&query=Gumussoy%2C+S">Suat Gumussoy</a>, <a href="/search/cs?searchtype=author&query=Ogras%2C+U">Umit Ogras</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2012.04479v2-abstract-short" style="display: inline;"> Human activity recognition (HAR) research has increased in recent years due to its applications in mobile health monitoring, activity recognition, and patient rehabilitation. The typical approach is training a HAR classifier offline with known users and then using the same classifier for new users. However, the accuracy for new users can be low with this approach if their activity patterns are dif… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2012.04479v2-abstract-full').style.display = 'inline'; document.getElementById('2012.04479v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2012.04479v2-abstract-full" style="display: none;"> Human activity recognition (HAR) research has increased in recent years due to its applications in mobile health monitoring, activity recognition, and patient rehabilitation. The typical approach is training a HAR classifier offline with known users and then using the same classifier for new users. However, the accuracy for new users can be low with this approach if their activity patterns are different than those in the training data. At the same time, training from scratch for new users is not feasible for mobile applications due to the high computational cost and training time. To address this issue, we propose a HAR transfer learning framework with two components. First, a representational analysis reveals common features that can transfer across users and user-specific features that need to be customized. Using this insight, we transfer the reusable portion of the offline classifier to new users and fine-tune only the rest. Our experiments with five datasets show up to 43% accuracy improvement and 66% training time reduction when compared to the baseline without using transfer learning. Furthermore, measurements on the Nvidia Jetson Xavier-NX hardware platform reveal that the power and energy consumption decrease by 43% and 68%, respectively, while achieving the same or higher accuracy as training from scratch. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2012.04479v2-abstract-full').style.display = 'none'; document.getElementById('2012.04479v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 February, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 December, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2020. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2008.09728">arXiv:2008.09728</a> <span> [<a href="https://arxiv.org/pdf/2008.09728">pdf</a>, <a href="https://arxiv.org/format/2008.09728">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Online Adaptive Learning for Runtime Resource Management of Heterogeneous SoCs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mandal%2C+S+K">Sumit K. Mandal</a>, <a href="/search/cs?searchtype=author&query=Ogras%2C+U+Y">Umit Y. Ogras</a>, <a href="/search/cs?searchtype=author&query=Doppa%2C+J+R">Janardhan Rao Doppa</a>, <a href="/search/cs?searchtype=author&query=Ayoub%2C+R+Z">Raid Z. Ayoub</a>, <a href="/search/cs?searchtype=author&query=Kishinevsky%2C+M">Michael Kishinevsky</a>, <a href="/search/cs?searchtype=author&query=Pande%2C+P+P">Partha P. Pande</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2008.09728v1-abstract-short" style="display: inline;"> Dynamic resource management has become one of the major areas of research in modern computer and communication system design due to lower power consumption and higher performance demands. The number of integrated cores, level of heterogeneity and amount of control knobs increase steadily. As a result, the system complexity is increasing faster than our ability to optimize and dynamically manage th… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2008.09728v1-abstract-full').style.display = 'inline'; document.getElementById('2008.09728v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2008.09728v1-abstract-full" style="display: none;"> Dynamic resource management has become one of the major areas of research in modern computer and communication system design due to lower power consumption and higher performance demands. The number of integrated cores, level of heterogeneity and amount of control knobs increase steadily. As a result, the system complexity is increasing faster than our ability to optimize and dynamically manage the resources. Moreover, offline approaches are sub-optimal due to workload variations and large volume of new applications unknown at design time. This paper first reviews recent online learning techniques for predicting system performance, power, and temperature. Then, we describe the use of predictive models for online control using two modern approaches: imitation learning (IL) and an explicit nonlinear model predictive control (NMPC). Evaluations on a commercial mobile platform with 16 benchmarks show that the IL approach successfully adapts the control policy to unknown applications. The explicit NMPC provides 25% energy savings compared to a state-of-the-art algorithm for multi-variable power management of modern GPU sub-systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2008.09728v1-abstract-full').style.display = 'none'; document.getElementById('2008.09728v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 August, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper appeared in the Proceedings of Design Automation Conference 2020</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2008.03904">arXiv:2008.03904</a> <span> [<a href="https://arxiv.org/pdf/2008.03904">pdf</a>, <a href="https://arxiv.org/format/2008.03904">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Performance">cs.PF</span> </div> </div> <p class="title is-5 mathjax"> Performance Analysis of Priority-Aware NoCs with Deflection Routing under Traffic Congestion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mandal%2C+S+K">Sumit K. Mandal</a>, <a href="/search/cs?searchtype=author&query=Krishnakumar%2C+A">Anish Krishnakumar</a>, <a href="/search/cs?searchtype=author&query=Ayoub%2C+R">Raid Ayoub</a>, <a href="/search/cs?searchtype=author&query=Kishinevsky%2C+M">Michael Kishinevsky</a>, <a href="/search/cs?searchtype=author&query=Ogras%2C+U+Y">Umit Y. Ogras</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2008.03904v3-abstract-short" style="display: inline;"> Priority-aware networks-on-chip (NoCs) are used in industry to achieve predictable latency under different workload conditions. These NoCs incorporate deflection routing to minimize queuing resources within routers and achieve low latency during low traffic load. However, deflected packets can exacerbate congestion during high traffic load since they consume the NoC bandwidth. State-of-the-art ana… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2008.03904v3-abstract-full').style.display = 'inline'; document.getElementById('2008.03904v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2008.03904v3-abstract-full" style="display: none;"> Priority-aware networks-on-chip (NoCs) are used in industry to achieve predictable latency under different workload conditions. These NoCs incorporate deflection routing to minimize queuing resources within routers and achieve low latency during low traffic load. However, deflected packets can exacerbate congestion during high traffic load since they consume the NoC bandwidth. State-of-the-art analytical models for priority-aware NoCs ignore deflected traffic despite its significant latency impact during congestion. This paper proposes a novel analytical approach to estimate end-to-end latency of priority-aware NoCs with deflection routing under bursty and heavy traffic scenarios. Experimental evaluations show that the proposed technique outperforms alternative approaches and estimates the average latency for real applications with less than 8% error compared to cycle-accurate simulations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2008.03904v3-abstract-full').style.display = 'none'; document.getElementById('2008.03904v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 November, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 August, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This article is in the Proceedings of ICCAD 2020</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2007.13951">arXiv:2007.13951</a> <span> [<a href="https://arxiv.org/pdf/2007.13951">pdf</a>, <a href="https://arxiv.org/format/2007.13951">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Performance">cs.PF</span> </div> </div> <p class="title is-5 mathjax"> Analytical Performance Modeling of NoCs under Priority Arbitration and Bursty Traffic </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mandal%2C+S+K">Sumit K. Mandal</a>, <a href="/search/cs?searchtype=author&query=Ayoub%2C+R">Raid Ayoub</a>, <a href="/search/cs?searchtype=author&query=Kishinevsky%2C+M">Michael Kishinevsky</a>, <a href="/search/cs?searchtype=author&query=Islam%2C+M+M">Mohammad M. Islam</a>, <a href="/search/cs?searchtype=author&query=Ogras%2C+U+Y">Umit Y. Ogras</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2007.13951v1-abstract-short" style="display: inline;"> Networks-on-Chip (NoCs) used in commercial many-core processors typically incorporate priority arbitration. Moreover, they experience bursty traffic due to application workloads. However, most state-of-the-art NoC analytical performance analysis techniques assume fair arbitration and simple traffic models. To address these limitations, we propose an analytical modeling technique for priority-aware… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2007.13951v1-abstract-full').style.display = 'inline'; document.getElementById('2007.13951v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2007.13951v1-abstract-full" style="display: none;"> Networks-on-Chip (NoCs) used in commercial many-core processors typically incorporate priority arbitration. Moreover, they experience bursty traffic due to application workloads. However, most state-of-the-art NoC analytical performance analysis techniques assume fair arbitration and simple traffic models. To address these limitations, we propose an analytical modeling technique for priority-aware NoCs under bursty traffic. Experimental evaluations with synthetic and bursty traffic show that the proposed approach has less than 10% modeling error with respect to cycle-accurate NoC simulator. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2007.13951v1-abstract-full').style.display = 'none'; document.getElementById('2007.13951v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 July, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper will appear in a future issue of IEEE Embedded Systems Letters</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2007.09361">arXiv:2007.09361</a> <span> [<a href="https://arxiv.org/pdf/2007.09361">pdf</a>, <a href="https://arxiv.org/format/2007.09361">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/TCAD.2020.3012861">10.1109/TCAD.2020.3012861 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Runtime Task Scheduling using Imitation Learning for Heterogeneous Many-Core Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Krishnakumar%2C+A">Anish Krishnakumar</a>, <a href="/search/cs?searchtype=author&query=Arda%2C+S+E">Samet E. Arda</a>, <a href="/search/cs?searchtype=author&query=Goksoy%2C+A+A">A. Alper Goksoy</a>, <a href="/search/cs?searchtype=author&query=Mandal%2C+S+K">Sumit K. Mandal</a>, <a href="/search/cs?searchtype=author&query=Ogras%2C+U+Y">Umit Y. Ogras</a>, <a href="/search/cs?searchtype=author&query=Sartor%2C+A+L">Anderson L. Sartor</a>, <a href="/search/cs?searchtype=author&query=Marculescu%2C+R">Radu Marculescu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2007.09361v2-abstract-short" style="display: inline;"> Domain-specific systems-on-chip, a class of heterogeneous many-core systems, are recognized as a key approach to narrow down the performance and energy-efficiency gap between custom hardware accelerators and programmable processors. Reaching the full potential of these architectures depends critically on optimally scheduling the applications to available resources at runtime. Existing optimization… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2007.09361v2-abstract-full').style.display = 'inline'; document.getElementById('2007.09361v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2007.09361v2-abstract-full" style="display: none;"> Domain-specific systems-on-chip, a class of heterogeneous many-core systems, are recognized as a key approach to narrow down the performance and energy-efficiency gap between custom hardware accelerators and programmable processors. Reaching the full potential of these architectures depends critically on optimally scheduling the applications to available resources at runtime. Existing optimization-based techniques cannot achieve this objective at runtime due to the combinatorial nature of the task scheduling problem. As the main theoretical contribution, this paper poses scheduling as a classification problem and proposes a hierarchical imitation learning (IL)-based scheduler that learns from an Oracle to maximize the performance of multiple domain-specific applications. Extensive evaluations with six streaming applications from wireless communications and radar domains show that the proposed IL-based scheduler approximates an offline Oracle policy with more than 99% accuracy for performance- and energy-based optimization objectives. Furthermore, it achieves almost identical performance to the Oracle with a low runtime overhead and successfully adapts to new applications, many-core system configurations, and runtime variations in application characteristics. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2007.09361v2-abstract-full').style.display = 'none'; document.getElementById('2007.09361v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 August, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 July, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">14 pages, 12 figures, 8 tables. Accepted for publication in Embedded Systems Week CODES+ISSS 2020 (Special Issue in IEEE TCAD)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2004.01636">arXiv:2004.01636</a> <span> [<a href="https://arxiv.org/pdf/2004.01636">pdf</a>, <a href="https://arxiv.org/format/2004.01636">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> User-Space Emulation Framework for Domain-Specific SoC Design </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mack%2C+J">Joshua Mack</a>, <a href="/search/cs?searchtype=author&query=Kumbhare%2C+N">Nirmal Kumbhare</a>, <a href="/search/cs?searchtype=author&query=NK%2C+A">Anish NK</a>, <a href="/search/cs?searchtype=author&query=Ogras%2C+U+Y">Umit Y. Ogras</a>, <a href="/search/cs?searchtype=author&query=Akoglu%2C+A">Ali Akoglu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2004.01636v2-abstract-short" style="display: inline;"> In this work, we propose a portable, Linux-based emulation framework to provide an ecosystem for hardware-software co-design of Domain-specific SoCs (DSSoCs) and enable their rapid evaluation during the pre-silicon design phase. This framework holistically targets three key challenges of DSSoC design: accelerator integration, resource management, and application development. We address these chall… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2004.01636v2-abstract-full').style.display = 'inline'; document.getElementById('2004.01636v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2004.01636v2-abstract-full" style="display: none;"> In this work, we propose a portable, Linux-based emulation framework to provide an ecosystem for hardware-software co-design of Domain-specific SoCs (DSSoCs) and enable their rapid evaluation during the pre-silicon design phase. This framework holistically targets three key challenges of DSSoC design: accelerator integration, resource management, and application development. We address these challenges via a flexible and lightweight user-space runtime environment that enables easy integration of new accelerators, scheduling heuristics, and user applications, and we illustrate the utility of each through various case studies. With signal processing (WiFi and RADAR) as the target domain, we use our framework to evaluate the performance of various dynamic workloads on hypothetical DSSoC hardware configurations composed of mixtures of CPU cores and FFT accelerators using a Zynq UltraScale+TM MPSoC. We show the portability of this framework by conducting a similar study on an Odroid platform composed of big.LITTLE ARM clusters. Finally, we introduce a prototype compilation toolchain that enables automatic mapping of unlabeled C code to DSSoC platforms. Taken together, this environment offers a unique ecosystem to rapidly perform functional verification and obtain performance and utilization estimates that help accelerate convergence towards a final DSSoC design. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2004.01636v2-abstract-full').style.display = 'none'; document.getElementById('2004.01636v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 April, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 April, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, 11 figures. To be published in proceedings of 2020 Heterogeneity in Computing Workshop http://hcw.oucreate.com/ held in conjunction with IPDPS 2020 http://www.ipdps.org/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2003.09526">arXiv:2003.09526</a> <span> [<a href="https://arxiv.org/pdf/2003.09526">pdf</a>, <a href="https://arxiv.org/format/2003.09526">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> An Energy-Aware Online Learning Framework for Resource Management in Heterogeneous Platforms </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mandal%2C+S+K">Sumit K. Mandal</a>, <a href="/search/cs?searchtype=author&query=Bhat%2C+G">Ganapati Bhat</a>, <a href="/search/cs?searchtype=author&query=Doppa%2C+J+R">Janardhan Rao Doppa</a>, <a href="/search/cs?searchtype=author&query=Pande%2C+P+P">Partha Pratim Pande</a>, <a href="/search/cs?searchtype=author&query=Ogras%2C+U+Y">Umit Y. Ogras</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2003.09526v1-abstract-short" style="display: inline;"> Mobile platforms must satisfy the contradictory requirements of fast response time and minimum energy consumption as a function of dynamically changing applications. To address this need, system-on-chips (SoC) that are at the heart of these devices provide a variety of control knobs, such as the number of active cores and their voltage/frequency levels. Controlling these knobs optimally at runtime… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2003.09526v1-abstract-full').style.display = 'inline'; document.getElementById('2003.09526v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2003.09526v1-abstract-full" style="display: none;"> Mobile platforms must satisfy the contradictory requirements of fast response time and minimum energy consumption as a function of dynamically changing applications. To address this need, system-on-chips (SoC) that are at the heart of these devices provide a variety of control knobs, such as the number of active cores and their voltage/frequency levels. Controlling these knobs optimally at runtime is challenging for two reasons. First, the large configuration space prohibits exhaustive solutions. Second, control policies designed offline are at best sub-optimal since many potential new applications are unknown at design-time. We address these challenges by proposing an online imitation learning approach. Our key idea is to construct an offline policy and adapt it online to new applications to optimize a given metric (e.g., energy). The proposed methodology leverages the supervision enabled by power-performance models learned at runtime. We demonstrate its effectiveness on a commercial mobile platform with 16 diverse benchmarks. Our approach successfully adapts the control policy to an unknown application after executing less than 25% of its instructions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2003.09526v1-abstract-full').style.display = 'none'; document.getElementById('2003.09526v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 March, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper has been accepted to be published in a future issue of ACM TODAES</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2003.09016">arXiv:2003.09016</a> <span> [<a href="https://arxiv.org/pdf/2003.09016">pdf</a>, <a href="https://arxiv.org/format/2003.09016">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> </div> </div> <p class="title is-5 mathjax"> DS3: A System-Level Domain-Specific System-on-Chip Simulation Framework </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Arda%2C+S+E">Samet E. Arda</a>, <a href="/search/cs?searchtype=author&query=NK%2C+A">Anish NK</a>, <a href="/search/cs?searchtype=author&query=Goksoy%2C+A+A">A. Alper Goksoy</a>, <a href="/search/cs?searchtype=author&query=Kumbhare%2C+N">Nirmal Kumbhare</a>, <a href="/search/cs?searchtype=author&query=Mack%2C+J">Joshua Mack</a>, <a href="/search/cs?searchtype=author&query=Sartor%2C+A+L">Anderson L. Sartor</a>, <a href="/search/cs?searchtype=author&query=Akoglu%2C+A">Ali Akoglu</a>, <a href="/search/cs?searchtype=author&query=Marculescu%2C+R">Radu Marculescu</a>, <a href="/search/cs?searchtype=author&query=Ogras%2C+U+Y">Umit Y. Ogras</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2003.09016v1-abstract-short" style="display: inline;"> Heterogeneous systems-on-chip (SoCs) are highly favorable computing platforms due to their superior performance and energy efficiency potential compared to homogeneous architectures. They can be further tailored to a specific domain of applications by incorporating processing elements (PEs) that accelerate frequently used kernels in these applications. However, this potential is contingent upon op… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2003.09016v1-abstract-full').style.display = 'inline'; document.getElementById('2003.09016v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2003.09016v1-abstract-full" style="display: none;"> Heterogeneous systems-on-chip (SoCs) are highly favorable computing platforms due to their superior performance and energy efficiency potential compared to homogeneous architectures. They can be further tailored to a specific domain of applications by incorporating processing elements (PEs) that accelerate frequently used kernels in these applications. However, this potential is contingent upon optimizing the SoC for the target domain and utilizing its resources effectively at runtime. To this end, system-level design - including scheduling, power-thermal management algorithms and design space exploration studies - plays a crucial role. This paper presents a system-level domain-specific SoC simulation (DS3) framework to address this need. DS3 enables both design space exploration and dynamic resource management for power-performance optimization of domain applications. We showcase DS3 using six real-world applications from wireless communications and radar processing domain. DS3, as well as the reference applications, is shared as open-source software to stimulate research in this area. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2003.09016v1-abstract-full').style.display = 'none'; document.getElementById('2003.09016v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 March, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">14 pages, 20 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1908.03664">arXiv:1908.03664</a> <span> [<a href="https://arxiv.org/pdf/1908.03664">pdf</a>, <a href="https://arxiv.org/format/1908.03664">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> </div> </div> <p class="title is-5 mathjax"> Work-in-Progress: A Simulation Framework for Domain-Specific System-on-Chips </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Arda%2C+S+E">Samet E. Arda</a>, <a href="/search/cs?searchtype=author&query=NK%2C+A">Anish NK</a>, <a href="/search/cs?searchtype=author&query=Goksoy%2C+A+A">A. Alper Goksoy</a>, <a href="/search/cs?searchtype=author&query=Mack%2C+J">Joshua Mack</a>, <a href="/search/cs?searchtype=author&query=Kumbhare%2C+N">Nirmal Kumbhare</a>, <a href="/search/cs?searchtype=author&query=Sartor%2C+A+L">Anderson L. Sartor</a>, <a href="/search/cs?searchtype=author&query=Akoglu%2C+A">Ali Akoglu</a>, <a href="/search/cs?searchtype=author&query=Marculescu%2C+R">Radu Marculescu</a>, <a href="/search/cs?searchtype=author&query=Ogras%2C+U+Y">Umit Y. Ogras</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1908.03664v1-abstract-short" style="display: inline;"> Heterogeneous system-on-chips (SoCs) have become the standard embedded computing platforms due to their potential to deliver superior performance and energy efficiency compared to homogeneous architectures. They can be particularly suited to target a specific domain of applications. However, this potential is contingent upon optimizing the SoC for the target domain and utilizing its resources effe… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1908.03664v1-abstract-full').style.display = 'inline'; document.getElementById('1908.03664v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1908.03664v1-abstract-full" style="display: none;"> Heterogeneous system-on-chips (SoCs) have become the standard embedded computing platforms due to their potential to deliver superior performance and energy efficiency compared to homogeneous architectures. They can be particularly suited to target a specific domain of applications. However, this potential is contingent upon optimizing the SoC for the target domain and utilizing its resources effectively at run-time. Cycle-accurate instruction set simulators are not suitable for this optimization, since meaningful temperature and power consumption evaluations require simulating seconds, if not minutes, of workloads. This paper presents a system-level domain-specific SoC simulation (DS3) framework to address this need. DS3 enables both design space exploration and dynamic resource management for power-performance optimization for domain applications with$~600\times$ speedup compared to commonly used gem5 simulator. We showcase DS3 using five applications from wireless communications and radar processing domain. DS3, as well as the reference applications, will be shared as open-source software to stimulate research in this area. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1908.03664v1-abstract-full').style.display = 'none'; document.getElementById('1908.03664v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 August, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2019. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1908.02408">arXiv:1908.02408</a> <span> [<a href="https://arxiv.org/pdf/1908.02408">pdf</a>, <a href="https://arxiv.org/format/1908.02408">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Performance">cs.PF</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Analytical Performance Models for NoCs with Multiple Priority Traffic Classes </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mandal%2C+S+K">Sumit K. Mandal</a>, <a href="/search/cs?searchtype=author&query=Ayoub%2C+R">Raid Ayoub</a>, <a href="/search/cs?searchtype=author&query=Kishinevsky%2C+M">Michael Kishinevsky</a>, <a href="/search/cs?searchtype=author&query=Ogras%2C+U+Y">Umit Y. Ogras</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1908.02408v2-abstract-short" style="display: inline;"> Networks-on-chip (NoCs) have become the standard for interconnect solutions in industrial designs ranging from client CPUs to many-core chip-multiprocessors. Since NoCs play a vital role in system performance and power consumption, pre-silicon evaluation environments include cycle-accurate NoC simulators. Long simulations increase the execution time of evaluation frameworks, which are already noto… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1908.02408v2-abstract-full').style.display = 'inline'; document.getElementById('1908.02408v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1908.02408v2-abstract-full" style="display: none;"> Networks-on-chip (NoCs) have become the standard for interconnect solutions in industrial designs ranging from client CPUs to many-core chip-multiprocessors. Since NoCs play a vital role in system performance and power consumption, pre-silicon evaluation environments include cycle-accurate NoC simulators. Long simulations increase the execution time of evaluation frameworks, which are already notoriously slow, and prohibit design-space exploration. Existing analytical NoC models, which assume fair arbitration, cannot replace these simulations since industrial NoCs typically employ priority schedulers and multiple priority classes. To address this limitation, we propose a systematic approach to construct priority-aware analytical performance models using micro-architecture specifications and input traffic. Our approach consists of developing two novel transformations of queuing system and designing an algorithm which iteratively uses these two transformations to estimate end-to-end latency. Our approach decomposes the given NoC into individual queues with modified service time to enable accurate and scalable latency computations. Specifically, we introduce novel transformations along with an algorithm that iteratively applies these transformations to decompose the queuing system. Experimental evaluations using real architectures and applications show high accuracy of 97% and up to 2.5x speedup in full-system simulation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1908.02408v2-abstract-full').style.display = 'none'; document.getElementById('1908.02408v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 January, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 August, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2019. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This article will appear as part of the ESWEEK-TECS special issue and will be presented in the International Conference on Compilers, Architecture, and Synthesis for Embedded Systems (CASES) 2019</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1904.09814">arXiv:1904.09814</a> <span> [<a href="https://arxiv.org/pdf/1904.09814">pdf</a>, <a href="https://arxiv.org/format/1904.09814">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Other Computer Science">cs.OH</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> Power and Thermal Analysis of Commercial Mobile Platforms: Experiments and Case Studies </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Bhat%2C+G">Ganapati Bhat</a>, <a href="/search/cs?searchtype=author&query=Gumussoy%2C+S">Suat Gumussoy</a>, <a href="/search/cs?searchtype=author&query=Ogras%2C+U+Y">Umit Y. Ogras</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1904.09814v1-abstract-short" style="display: inline;"> State-of-the-art mobile processors can deliver fast response time and high throughput to maximize the user experience. However, high performance comes at the expense of larger power density, which leads to higher skin temperatures. Since this can degrade the user experience, there is a strong need for power consumption and thermal analysis in mobile processors. In this paper, we first perform expe… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1904.09814v1-abstract-full').style.display = 'inline'; document.getElementById('1904.09814v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1904.09814v1-abstract-full" style="display: none;"> State-of-the-art mobile processors can deliver fast response time and high throughput to maximize the user experience. However, high performance comes at the expense of larger power density, which leads to higher skin temperatures. Since this can degrade the user experience, there is a strong need for power consumption and thermal analysis in mobile processors. In this paper, we first perform experiments on the Nexus 6P phone to study the power, performance and thermal behavior of modern smartphones. Using the insight from these experiments, we propose a control algorithm that throttles select applications without affecting other apps. We demonstrate our governor on the Exynos 5422 processor employed in the Odroid-XU3 board. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1904.09814v1-abstract-full').style.display = 'none'; document.getElementById('1904.09814v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 March, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2019. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To appear in proceedings of IEEE DATE 2019</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1903.03168">arXiv:1903.03168</a> <span> [<a href="https://arxiv.org/pdf/1903.03168">pdf</a>, <a href="https://arxiv.org/format/1903.03168">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> OpenHealth: Open Source Platform for Wearable Health Monitoring </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Bhat%2C+G">Ganapati Bhat</a>, <a href="/search/cs?searchtype=author&query=Deb%2C+R">Ranadeep Deb</a>, <a href="/search/cs?searchtype=author&query=Ogras%2C+U+Y">Umit Y. Ogras</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1903.03168v2-abstract-short" style="display: inline;"> Movement disorders are becoming one of the leading causes of functional disability due to aging populations and extended life expectancy. Wearable health monitoring is emerging as an effective way to augment clinical care for movement disorders. However, wearable devices face a number of adaptation and technical challenges that hinder their widespread adoption. To address these challenges, we intr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1903.03168v2-abstract-full').style.display = 'inline'; document.getElementById('1903.03168v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1903.03168v2-abstract-full" style="display: none;"> Movement disorders are becoming one of the leading causes of functional disability due to aging populations and extended life expectancy. Wearable health monitoring is emerging as an effective way to augment clinical care for movement disorders. However, wearable devices face a number of adaptation and technical challenges that hinder their widespread adoption. To address these challenges, we introduce OpenHealth, an open source platform for wearable health monitoring. OpenHealth aims to design a standard set of hardware/software and wearable devices that can enable autonomous collection of clinically relevant data. The OpenHealth platform includes a wearable device, standard software interfaces and reference implementations of human activity and gesture recognition applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1903.03168v2-abstract-full').style.display = 'none'; document.getElementById('1903.03168v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 March, 2019; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 February, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2019. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To appear in a future issue of IEEE Design & Test</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1808.08615">arXiv:1808.08615</a> <span> [<a href="https://arxiv.org/pdf/1808.08615">pdf</a>, <a href="https://arxiv.org/format/1808.08615">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3240765.3240833">10.1145/3240765.3240833 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Online Human Activity Recognition using Low-Power Wearable Devices </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Bhat%2C+G">Ganapati Bhat</a>, <a href="/search/cs?searchtype=author&query=Deb%2C+R">Ranadeep Deb</a>, <a href="/search/cs?searchtype=author&query=Chaurasia%2C+V+V">Vatika Vardhan Chaurasia</a>, <a href="/search/cs?searchtype=author&query=Shill%2C+H">Holly Shill</a>, <a href="/search/cs?searchtype=author&query=Ogras%2C+U+Y">Umit Y. Ogras</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1808.08615v2-abstract-short" style="display: inline;"> Human activity recognition~(HAR) has attracted significant research interest due to its applications in health monitoring and patient rehabilitation. Recent research on HAR focuses on using smartphones due to their widespread use. However, this leads to inconvenient use, limited choice of sensors and inefficient use of resources, since smartphones are not designed for HAR. This paper presents the… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1808.08615v2-abstract-full').style.display = 'inline'; document.getElementById('1808.08615v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1808.08615v2-abstract-full" style="display: none;"> Human activity recognition~(HAR) has attracted significant research interest due to its applications in health monitoring and patient rehabilitation. Recent research on HAR focuses on using smartphones due to their widespread use. However, this leads to inconvenient use, limited choice of sensors and inefficient use of resources, since smartphones are not designed for HAR. This paper presents the first HAR framework that can perform both online training and inference. The proposed framework starts with a novel technique that generates features using the fast Fourier and discrete wavelet transforms of a textile-based stretch sensor and accelerometer. Using these features, we design an artificial neural network classifier which is trained online using the policy gradient algorithm. Experiments on a low power IoT device (TI-CC2650 MCU) with nine users show 97.7% accuracy in identifying six activities and their transitions with less than 12.5 mW power consumption. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1808.08615v2-abstract-full').style.display = 'none'; document.getElementById('1808.08615v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 February, 2019; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 August, 2018; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2018. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This is in proceedings of ICCAD 2018. The datasets are available at https://github.com/gmbhat/human-activity-recognition</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/0710.4707">arXiv:0710.4707</a> <span> [<a href="https://arxiv.org/pdf/0710.4707">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> </div> </div> <p class="title is-5 mathjax"> Energy- and Performance-Driven NoC Communication Architecture Synthesis Using a Decomposition Approach </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ogras%2C+U+Y">Umit Y. Ogras</a>, <a href="/search/cs?searchtype=author&query=Marculescu%2C+R">Radu Marculescu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="0710.4707v1-abstract-short" style="display: inline;"> In this paper, we present a methodology for customized communication architecture synthesis that matches the communication requirements of the target application. This is an important problem, particularly for network-based implementations of complex applications. Our approach is based on using frequently encountered generic communication primitives as an alphabet capable of characterizing any g… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('0710.4707v1-abstract-full').style.display = 'inline'; document.getElementById('0710.4707v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="0710.4707v1-abstract-full" style="display: none;"> In this paper, we present a methodology for customized communication architecture synthesis that matches the communication requirements of the target application. This is an important problem, particularly for network-based implementations of complex applications. Our approach is based on using frequently encountered generic communication primitives as an alphabet capable of characterizing any given communication pattern. The proposed algorithm searches through the entire design space for a solution that minimizes the system total energy consumption, while satisfying the other design constraints. Compared to the standard mesh architecture, the customized architecture generated by the newly proposed approach shows about 36% throughput increase and 51% reduction in the energy required to encrypt 128 bits of data with a standard encryption algorithm. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('0710.4707v1-abstract-full').style.display = 'none'; document.getElementById('0710.4707v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 October, 2007; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2007. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted on behalf of EDAA (http://www.edaa.com/)</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Dans Design, Automation and Test in Europe - DATE'05, Munich : Allemagne (2005) </p> </li> </ol> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>