CINXE.COM

Hardware Architecture

<!DOCTYPE html> <html lang="en"> <head> <title>Hardware Architecture </title> <meta name="viewport" content="width=device-width, initial-scale=1"> <link rel="apple-touch-icon" sizes="180x180" href="/static/browse/0.3.4/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="/static/browse/0.3.4/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="/static/browse/0.3.4/images/icons/favicon-16x16.png"> <link rel="manifest" href="/static/browse/0.3.4/images/icons/site.webmanifest"> <link rel="mask-icon" href="/static/browse/0.3.4/images/icons/safari-pinned-tab.svg" color="#5bbad5"> <meta name="msapplication-TileColor" content="#da532c"> <meta name="theme-color" content="#ffffff"> <link rel="stylesheet" type="text/css" media="screen" href="/static/browse/0.3.4/css/arXiv.css?v=20241206" /> <link rel="stylesheet" type="text/css" media="print" href="/static/browse/0.3.4/css/arXiv-print.css?v=20200611" /> <link rel="stylesheet" type="text/css" media="screen" href="/static/browse/0.3.4/css/browse_search.css" /> <script language="javascript" src="/static/browse/0.3.4/js/accordion.js" /></script> <script src="/static/browse/0.3.4/js/mathjaxToggle.min.js" type="text/javascript"></script> <script type="text/javascript" language="javascript">mathjaxToggle();</script> </head> <body class="with-cu-identity"> <div class="flex-wrap-footer"> <header> <a href="#content" class="is-sr-only">Skip to main content</a> <!-- start desktop header --> <div class="columns is-vcentered is-hidden-mobile" id="cu-identity"> <div class="column" id="cu-logo"> <a href="https://www.cornell.edu/"><img src="/static/browse/0.3.4/images/icons/cu/cornell-reduced-white-SMALL.svg" alt="Cornell University" /></a> </div><div class="column" id="support-ack"> <span id="support-ack-url">We gratefully acknowledge support from the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all 
contributors.</span> <a href="https://info.arxiv.org/about/donate.html" class="btn-header-donate">Donate</a> </div> </div> <div id="header" class="is-hidden-mobile"> <a aria-hidden="true" tabindex="-1" href="/IgnoreMe"></a> <div class="header-breadcrumbs"> <a href="/"><img src="/static/browse/0.3.4/images/arxiv-logo-one-color-white.svg" alt="arxiv logo" style="height:40px;"/></a> <span>&gt;</span> <a href="/list/cs.AR/recent">cs.AR</a> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div><!-- /end desktop header --> <div class="mobile-header"> <div class="columns is-mobile"> <div class="column logo-arxiv"><a href="https://arxiv.org/"><img 
src="/static/browse/0.3.4/images/arxiv-logomark-small-white.svg" alt="arXiv logo" style="height:60px;" /></a></div> <div class="column logo-cornell"><a href="https://www.cornell.edu/"> <picture> <source media="(min-width: 501px)" srcset="/static/browse/0.3.4/images/icons/cu/cornell-reduced-white-SMALL.svg 400w" sizes="400w" /> <source srcset="/static/browse/0.3.4/images/icons/cu/cornell_seal_simple_black.svg 2x" /> <img src="/static/browse/0.3.4/images/icons/cu/cornell-reduced-white-SMALL.svg" alt="Cornell University Logo" /> </picture> </a></div> <div class="column nav" id="toggle-container" role="menubar"> <button class="toggle-control"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-white"><title>open search</title><path d="M505 442.7L405.3 343c-4.5-4.5-10.6-7-17-7H372c27.6-35.3 44-79.7 44-128C416 93.1 322.9 0 208 0S0 93.1 0 208s93.1 208 208 208c48.3 0 92.7-16.4 128-44v16.3c0 6.4 2.5 12.5 7 17l99.7 99.7c9.4 9.4 24.6 9.4 33.9 0l28.3-28.3c9.4-9.4 9.4-24.6.1-34zM208 336c-70.7 0-128-57.2-128-128 0-70.7 57.2-128 128-128 70.7 0 128 57.2 128 128 0 70.7-57.2 128-128 128z"/></svg></button> <div class="mobile-toggle-block toggle-target"> <form class="mobile-search-form" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <input class="input" type="text" name="query" placeholder="Search..." 
aria-label="Search term or terms" /> <input type="hidden" name="source" value="header"> <input type="hidden" name="searchtype" value="all"> <button class="button">GO</button> </div> </form> </div> <button class="toggle-control"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-white" role="menu"><title>open navigation menu</title><path d="M16 132h416c8.837 0 16-7.163 16-16V76c0-8.837-7.163-16-16-16H16C7.163 60 0 67.163 0 76v40c0 8.837 7.163 16 16 16zm0 160h416c8.837 0 16-7.163 16-16v-40c0-8.837-7.163-16-16-16H16c-8.837 0-16 7.163-16 16v40c0 8.837 7.163 16 16 16zm0 160h416c8.837 0 16-7.163 16-16v-40c0-8.837-7.163-16-16-16H16c-8.837 0-16 7.163-16 16v40c0 8.837 7.163 16 16 16z"/ ></svg></button> <div class="mobile-toggle-block toggle-target"> <nav class="mobile-menu" aria-labelledby="mobilemenulabel"> <h2 id="mobilemenulabel">quick links</h2> <ul> <li><a href="https://arxiv.org/login">Login</a></li> <li><a href="https://info.arxiv.org/help">Help Pages</a></li> <li><a href="https://info.arxiv.org/about">About</a></li> </ul> </nav> </div> </div> </div> </div><!-- /end mobile-header --> </header> <main> <div id="content"> <div id='content-inner'> <div id='dlpage'> <h1>Hardware Architecture</h1> <ul> <li><a href="#item0">New submissions</a></li> <li><a href="#item20">Cross-lists</a></li> <li><a href="#item21">Replacements</a></li> </ul> <p>See <a id="recent-cs.AR" aria-labelledby="recent-cs.AR" href="/list/cs.AR/recent">recent</a> articles</p> <h3>Showing new listings for Tuesday, 18 March 2025</h3> <div class='paging'>Total of 25 entries </div> <div class='morefewer'>Showing up to 2000 entries per page: <a href=/list/cs.AR/new?skip=0&amp;show=1000 rel="nofollow"> fewer</a> | <span style="color: #454545">more</span> | <span style="color: #454545">all</span> </div> <dl id='articles'> <h3>New submissions (showing 19 of 19 entries)</h3> <dt> <a name='item1'>[1]</a> <a href ="/abs/2503.11654" title="Abstract" id="2503.11654"> arXiv:2503.11654 
</a> [<a href="/pdf/2503.11654" title="Download PDF" id="pdf-2503.11654" aria-labelledby="pdf-2503.11654">pdf</a>, <a href="https://arxiv.org/html/2503.11654v1" title="View HTML" id="html-2503.11654" aria-labelledby="html-2503.11654" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.11654" title="Other formats" id="oth-2503.11654" aria-labelledby="oth-2503.11654">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Enhanced LPDDR4X PHY in 12 nm FinFET </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Feldmann,+J">Johannes Feldmann</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Lappas,+J">Jan Lappas</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Esmaeilpour,+M">Mohammadreza Esmaeilpour</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Abdo,+H">Hussien Abdo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Weis,+C">Christian Weis</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wehn,+N">Norbert Wehn</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> RISC-V Summit Europe, Munich, 24-28th June 2024 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Hardware Architecture (cs.AR)</span> </div> <p class='mathjax'> The demand for memory technologies with high bandwidth, low power consumption, and enhanced reliability has led to the emergence of LPDDR4X DRAM memory. However, power efficiency and reliability depend not only on the memory device but also on its interfacing. To enable advanced monitoring of LPDDR4X DRAM devices and interface tuning, we propose a LPDDR4X PHY implemented in 12 nm FinFET technology. 
A RISC-V subsystem offers software-controlled DRAM interface access as well as external interfaces to connect additional sensors for monitoring temperature and current consumption of LPDDR4X DRAM devices. </p> </div> </dd> <dt> <a name='item2'>[2]</a> <a href ="/abs/2503.11658" title="Abstract" id="2503.11658"> arXiv:2503.11658 </a> [<a href="/pdf/2503.11658" title="Download PDF" id="pdf-2503.11658" aria-labelledby="pdf-2503.11658">pdf</a>, <a href="/format/2503.11658" title="Other formats" id="oth-2503.11658" aria-labelledby="oth-2503.11658">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Circuit Diagram Retrieval Based on Hierarchical Circuit Graph Representation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Gao,+M">Ming Gao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Qiu,+R">Ruichen Qiu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Chang,+Z+H">Zeng Hui Chang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhang,+K">Kanjian Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wei,+H">Haikun Wei</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Chen,+H+C">Hong Cai Chen</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 11 pages, 10 figures, 7 tables, under review paper </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Hardware Architecture (cs.AR)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> In the domain of analog circuit design, the retrieval of circuit diagrams has drawn a great interest, primarily due to its vital role in the consultation of legacy designs and the detection of design plagiarism. 
Existing image retrieval techniques are adept at handling natural images, which convert images into feature vectors and retrieve similar images according to the closeness of these vectors. Nonetheless, these approaches exhibit limitations when applied to the more specialized and intricate domain of circuit diagrams. This paper presents a novel approach to circuit diagram retrieval by employing a graph representation of circuit diagrams, effectively reformulating the retrieval task as a graph retrieval problem. The proposed methodology consists of two principal components: a circuit diagram recognition algorithm designed to extract the circuit components and topological structure of the circuit using the proposed GAM-YOLO model and a 2-step connected domain filtering algorithm, and a hierarchical retrieval strategy based on graph similarity and different graph representation methods for analog circuits. Our methodology pioneers the utilization of graph representation in the retrieval of circuit diagrams, incorporating topological features that are commonly overlooked by standard image retrieval methods. The results of our experiments substantiate the efficacy of our approach in retrieving circuit diagrams across different types. 
</p> </div> </dd> <dt> <a name='item3'>[3]</a> <a href ="/abs/2503.11660" title="Abstract" id="2503.11660"> arXiv:2503.11660 </a> [<a href="/pdf/2503.11660" title="Download PDF" id="pdf-2503.11660" aria-labelledby="pdf-2503.11660">pdf</a>, <a href="https://arxiv.org/html/2503.11660v1" title="View HTML" id="html-2503.11660" aria-labelledby="html-2503.11660" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.11660" title="Other formats" id="oth-2503.11660" aria-labelledby="oth-2503.11660">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> A 28 nm AI microcontroller with tightly coupled zero-standby power weight memory featuring standard logic compatible 4 Mb 4-bits/cell embedded flash technology </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Kim,+D">Daewung Kim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Jeon,+S+H">Seong Hwan Jeon</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Jeon,+Y+H">Young Hee Jeon</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Kwon,+K">Kyung-Bae Kwon</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Kim,+J">Jigon Kim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Choi,+Y">Yeounghun Choi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Cha,+H">Hyunseung Cha</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Kwon,+K">Kitae Kwon</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Park,+D">Daesik Park</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Lee,+J">Jongseuk Lee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Kim,+S">Sihwan Kim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Song,+S">Seung-Hwan Song</a></div> <div class='list-comments mathjax'><span 
class='descriptor'>Comments:</span> 6 pages, 8 figures, Accepted as a full paper by the 2025 EDGE AI FOUNDATION Austin </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Hardware Architecture (cs.AR)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> This study introduces a novel AI microcontroller optimized for cost-effective, battery-powered edge AI applications. Unlike traditional single bit/cell memory configurations, the proposed microcontroller integrates zero-standby power weight memory featuring standard logic compatible 4-bits/cell embedded flash technology tightly coupled to a Near-Memory Computing Unit. This architecture enables efficient and low-power AI acceleration. Advanced state mapping and an overstress-free word line (WL) driver circuit extend verify levels, ensuring robust 16 state cell margin. A ping-pong buffer reduces internal data movement while supporting simultaneous multi-bit processing. The fabricated microcontroller demonstrated high reliability, maintaining accuracy after 160 hours of unpowered baking at 125$^\circ$C. 
</p> </div> </dd> <dt> <a name='item4'>[4]</a> <a href ="/abs/2503.11662" title="Abstract" id="2503.11662"> arXiv:2503.11662 </a> [<a href="/pdf/2503.11662" title="Download PDF" id="pdf-2503.11662" aria-labelledby="pdf-2503.11662">pdf</a>, <a href="https://arxiv.org/html/2503.11662v1" title="View HTML" id="html-2503.11662" aria-labelledby="html-2503.11662" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.11662" title="Other formats" id="oth-2503.11662" aria-labelledby="oth-2503.11662">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Lorecast: Layout-Aware Performance and Power Forecasting from Natural Language </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wang,+R">Runzhi Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Sengupta,+P">Prianka Sengupta</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Chen,+Y">Yiran Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Hu,+J">Jiang Hu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Hardware Architecture (cs.AR)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Machine Learning (cs.LG) </div> <p class='mathjax'> In chip design planning, obtaining reliable performance and power forecasts for various design options is of critical importance. Traditionally, this involves using system-level models, which often lack accuracy, or trial synthesis, which is both labor-intensive and time-consuming. We introduce a new methodology, called Lorecast, which accepts English prompts as input to rapidly generate layout-aware performance and power estimates. This approach bypasses the need for HDL code development or synthesis, making it both fast and user-friendly. 
Experimental results demonstrate that Lorecast achieves accuracy within a few percent of error compared to post-layout analysis. </p> </div> </dd> <dt> <a name='item5'>[5]</a> <a href ="/abs/2503.11663" title="Abstract" id="2503.11663"> arXiv:2503.11663 </a> [<a href="/pdf/2503.11663" title="Download PDF" id="pdf-2503.11663" aria-labelledby="pdf-2503.11663">pdf</a>, <a href="https://arxiv.org/html/2503.11663v1" title="View HTML" id="html-2503.11663" aria-labelledby="html-2503.11663" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.11663" title="Other formats" id="oth-2503.11663" aria-labelledby="oth-2503.11663">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> MEADOW: Memory-efficient Dataflow and Data Packing for Low Power Edge LLMs </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Moitra,+A">Abhishek Moitra</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Ghosh,+A">Arkapravo Ghosh</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Agarwal,+S">Shrey Agarwal</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Amarnath,+A">Aporva Amarnath</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Swaminathan,+K">Karthik Swaminathan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Panda,+P">Priyadarshini Panda</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 12 pages, 13 figures. 
Accepted to The Eighth Annual Conference on Machine Learning and Systems (MLSys), 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Hardware Architecture (cs.AR)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG) </div> <p class='mathjax'> The computational and memory challenges of large language models (LLMs) have sparked several optimization approaches towards their efficient implementation. While prior LLM-targeted quantization, and prior works on sparse acceleration have significantly mitigated the memory and computation bottleneck, they do so assuming high power platforms such as GPUs and server-class FPGAs with large off-chip memory bandwidths and employ a generalized matrix multiplication (GEMM) execution of all the layers in the decoder. In such a GEMM-based execution, data is fetched from an off-chip memory, computed and stored back. However, at reduced off-chip memory capacities, as is the case with low-power edge devices, this implementation strategy significantly increases the attention computation latency owing to the repeated storage and fetch of large intermediate tokens to and from the off-chip memory. Moreover, fetching the weight matrices from a bandwidth constrained memory further aggravates the memory bottleneck problem. To this end, we introduce MEADOW, a framework that significantly reduces the off-chip memory access for LLMs with a novel token-parallel head-sequential (TPHS) dataflow. Additionally, MEADOW applies weight packing that performs loss-less decomposition of large weight matrices to their unique elements thereby, reducing the enormous weight fetch latency. MEADOW demonstrates 1.5x and 2.5x lower decode and prefill latency, respectively, compared to a GEMM-based LLM implementation on the low power Xilinx ZCU102 FPGA platform that consumes less than 10W. 
Additionally, MEADOW achieves an end-to-end latency improvement of over 40%, compared to prior LLM optimization works. </p> </div> </dd> <dt> <a name='item6'>[6]</a> <a href ="/abs/2503.11665" title="Abstract" id="2503.11665"> arXiv:2503.11665 </a> [<a href="/pdf/2503.11665" title="Download PDF" id="pdf-2503.11665" aria-labelledby="pdf-2503.11665">pdf</a>, <a href="https://arxiv.org/html/2503.11665v1" title="View HTML" id="html-2503.11665" aria-labelledby="html-2503.11665" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.11665" title="Other formats" id="oth-2503.11665" aria-labelledby="oth-2503.11665">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Towards Efficient Flash Caches with Emerging NVMe Flexible Data Placement SSDs </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Allison,+M">Michael Allison</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=George,+A">Arun George</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Gonzalez,+J">Javier Gonzalez</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Helmick,+D">Dan Helmick</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Kumar,+V">Vikash Kumar</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Nair,+R">Roshan Nair</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Shah,+V">Vivek Shah</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> This is a pre-print version of the paper that will appear at Eurosys 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Hardware Architecture (cs.AR)</span>; Databases (cs.DB); Emerging Technologies (cs.ET) </div> <p class='mathjax'> NVMe Flash-based SSDs are widely deployed in data centers to cache working sets of large-scale web 
services. As data centers face increasing sustainability demands, such as reduced carbon emissions, efficient management of Flash overprovisioning and endurance has become crucial. Our analysis demonstrates that mixing data with different lifetimes on Flash blocks results in high device garbage collection costs, which either reduce device lifetime or necessitate host overprovisioning. Targeted data placement on Flash to minimize data intermixing and thus device write amplification shows promise for addressing this issue. <br>The NVMe Flexible Data Placement (FDP) proposal is a newly ratified technical proposal aimed at addressing data placement needs while reducing the software engineering costs associated with past storage interfaces, such as ZNS and Open-Channel SSDs. In this study, we explore the feasibility, benefits, and limitations of leveraging NVMe FDP primitives for data placement on Flash media in CacheLib, a popular open-source Flash cache widely deployed and used in Meta&#39;s software ecosystem as a caching building block. We demonstrate that targeted data placement in CacheLib using NVMe FDP SSDs helps reduce device write amplification, embodied carbon emissions, and power consumption with almost no overhead to other metrics. Using multiple production traces and their configurations from Meta and Twitter, we show that an ideal device write amplification of ~1 can be achieved with FDP, leading to improved SSD utilization and sustainable Flash cache deployments. 
</p> </div> </dd> <dt> <a name='item7'>[7]</a> <a href ="/abs/2503.11666" title="Abstract" id="2503.11666"> arXiv:2503.11666 </a> [<a href="/pdf/2503.11666" title="Download PDF" id="pdf-2503.11666" aria-labelledby="pdf-2503.11666">pdf</a>, <a href="https://arxiv.org/html/2503.11666v1" title="View HTML" id="html-2503.11666" aria-labelledby="html-2503.11666" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.11666" title="Other formats" id="oth-2503.11666" aria-labelledby="oth-2503.11666">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Optimizing Coverage-Driven Verification Using Machine Learning and PyUVM: A Novel Approach </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Kumari,+S">Suruchi Kumari</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Gadde,+D+N">Deepak Narayan Gadde</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Kumar,+A">Aman Kumar</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> To appear at 2025 IEEE International Symposium on Circuits and Systems, May 25-28 2025, London, United Kingdom </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Hardware Architecture (cs.AR)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> The escalating complexity of System-on-Chip (SoC) designs has created a bottleneck in verification, with traditional techniques struggling to achieve complete coverage. Existing techniques, such as Constrained Random Verification (CRV) and coverage-driven methodologies, rely on time-consuming and redundant simulation regression, leading to higher verification costs and longer time-to-market due to the manual effort required to adjust constraints and drive the stimuli to achieve coverage objectives. 
To address this challenge, we propose a novel methodology that leverages supervised Machine Learning (ML) to optimize simulation regressions, resulting in reduced simulation run-time and the number of test simulations required to achieve target coverage goals. We also investigate and compare the effectiveness of various supervised learning algorithms from scikit-learn. Our results demonstrate that these algorithms can achieve at least 99% coverage regain with significantly reduced simulation cycles. We utilize Python Universal Verification Methodology (PyUVM) over SystemVerilog-Universal Verification Methodology (SV-UVM) for testbench creation, enabling simpler constructs using Python and facilitating the reuse of existing ML libraries. Our methodology is applied to three diverse designs, and our results show that it can significantly reduce verification costs, manual efforts, and time-to-market, while enhancing verification productivity and completeness, by automating the testbench update process and achieving target coverage goals. 
</p> </div> </dd> <dt> <a name='item8'>[8]</a> <a href ="/abs/2503.11674" title="Abstract" id="2503.11674"> arXiv:2503.11674 </a> [<a href="/pdf/2503.11674" title="Download PDF" id="pdf-2503.11674" aria-labelledby="pdf-2503.11674">pdf</a>, <a href="https://arxiv.org/html/2503.11674v1" title="View HTML" id="html-2503.11674" aria-labelledby="html-2503.11674" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.11674" title="Other formats" id="oth-2503.11674" aria-labelledby="oth-2503.11674">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Timing-Driven Global Placement by Efficient Critical Path Extraction </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Shi,+Y">Yunqi Shi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Xu,+S">Siyuan Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Kai,+S">Shixiong Kai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Lin,+X">Xi Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Xue,+K">Ke Xue</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Yuan,+M">Mingxuan Yuan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Qian,+C">Chao Qian</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted by DATE&#39;25 as a Best Paper Award </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Hardware Architecture (cs.AR)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> Timing optimization during the global placement of integrated circuits has been a significant focus for decades, yet it remains a complex, unresolved issue. 
Recent analytical methods typically use pin-level timing information to adjust net weights, which is fast and simple but neglects the path-based nature of the timing graph. The existing path-based methods, however, cannot balance accuracy and efficiency due to the exponential growth of the number of critical paths. In this work, we propose a GPU-accelerated timing-driven global placement framework, integrating accurate path-level information into the efficient DREAMPlace infrastructure. It optimizes the fine-grained pin-to-pin attraction objective and is facilitated by efficient critical path extraction. We also design a quadratic distance loss function specifically to align with the RC timing model. Experimental results demonstrate that our method significantly outperforms the current leading timing-driven placers, achieving an average improvement of 40.5% in total negative slack (TNS) and 8.3% in worst negative slack (WNS), as well as an improvement in half-perimeter wirelength (HPWL). 
</p> </div> </dd> <dt> <a name='item9'>[9]</a> <a href ="/abs/2503.11685" title="Abstract" id="2503.11685"> arXiv:2503.11685 </a> [<a href="/pdf/2503.11685" title="Download PDF" id="pdf-2503.11685" aria-labelledby="pdf-2503.11685">pdf</a>, <a href="/format/2503.11685" title="Other formats" id="oth-2503.11685" aria-labelledby="oth-2503.11685">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> CORDIC Is All You Need </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Kokane,+O">Omkar Kokane</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Teman,+A">Adam Teman</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Jha,+A">Anushka Jha</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=SL,+G+P">Guru Prasath SL</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Raut,+G">Gopal Raut</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Lokhande,+M">Mukul Lokhande</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Chand,+S+V+J">S. V. Jaya Chand</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Dewangan,+T">Tanushree Dewangan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Vishvakarma,+S+K">Santosh Kumar Vishvakarma</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Hardware Architecture (cs.AR)</span>; Computer Vision and Pattern Recognition (cs.CV); Image and Video Processing (eess.IV) </div> <p class='mathjax'> Artificial intelligence necessitates adaptable hardware accelerators for efficient high-throughput million operations. We present pipelined architecture with CORDIC block for linear MAC computations and nonlinear iterative Activation Functions (AF) such as $tanh$, $sigmoid$, and $softmax$. 
This approach focuses on a Reconfigurable Processing Engine (RPE) based systolic array, with 40\% pruning rate, enhanced throughput up to 4.64$\times$, and reduction in power and area by 5.02 $\times$ and 4.06 $\times$ at CMOS 28 nm, with minor accuracy loss. FPGA implementation achieves a reduction of up to 2.5 $\times$ resource savings and 3 $\times$ power compared to prior works. The Systolic CORDIC engine for Reconfigurability and Enhanced throughput (SYCore) deploys an output stationary dataflow with the CAESAR control engine for diverse AI workloads such as Transformers, RNNs/LSTMs, and DNNs for applications like image detection, LLMs, and speech recognition. The energy-efficient and flexible approach extends the enhanced approach for edge AI accelerators supporting emerging workloads. </p> </div> </dd> <dt> <a name='item10'>[10]</a> <a href ="/abs/2503.11687" title="Abstract" id="2503.11687"> arXiv:2503.11687 </a> [<a href="/pdf/2503.11687" title="Download PDF" id="pdf-2503.11687" aria-labelledby="pdf-2503.11687">pdf</a>, <a href="https://arxiv.org/html/2503.11687v1" title="View HTML" id="html-2503.11687" aria-labelledby="html-2503.11687" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.11687" title="Other formats" id="oth-2503.11687" aria-labelledby="oth-2503.11687">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Review of Machine Learning for Micro-Electronic Design Verification </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Bennett,+C">Christopher Bennett</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Eder,+K">Kerstin Eder</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 40 pages, 13 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Hardware Architecture (cs.AR)</span>; Machine Learning (cs.LG) 
</div> <p class='mathjax'> Microelectronic design verification remains a critical bottleneck in device development, traditionally mitigated by expanding verification teams and computational resources. Since the late 1990s, machine learning (ML) has been proposed to enhance verification efficiency, yet many techniques have not achieved mainstream adoption. This review, from the perspective of verification and ML practitioners, examines the application of ML in dynamic-based techniques for functional verification of microelectronic designs, and provides a starting point for those new to this interdisciplinary field. Historical trends, techniques, ML types, and evaluation baselines are analysed to understand why previous research has not been widely adopted in industry. The review highlights the application of ML, the techniques used and critically discusses their limitations and successes. Although there is a wealth of promising research, real-world adoption is hindered by challenges in comparing techniques, identifying suitable applications, and the expertise required for implementation. This review proposes that the field can progress through the creation and use of open datasets, common benchmarks, and verification targets. By establishing open evaluation criteria, industry can guide future research. Parallels with ML in software verification suggest potential for collaboration. Additionally, greater use of open-source designs and verification environments can allow more researchers from outside the hardware verification discipline to contribute to the challenge of verifying microelectronic designs. 
</p> </div> </dd> <dt> <a name='item11'>[11]</a> <a href ="/abs/2503.11698" title="Abstract" id="2503.11698"> arXiv:2503.11698 </a> [<a href="/pdf/2503.11698" title="Download PDF" id="pdf-2503.11698" aria-labelledby="pdf-2503.11698">pdf</a>, <a href="https://arxiv.org/html/2503.11698v1" title="View HTML" id="html-2503.11698" aria-labelledby="html-2503.11698" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.11698" title="Other formats" id="oth-2503.11698" aria-labelledby="oth-2503.11698">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> A Comparison of the Cerebras Wafer-Scale Integration Technology with Nvidia GPU-based Systems for Artificial Intelligence </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Kundu,+Y">Yudhishthira Kundu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Kaur,+M">Manroop Kaur</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wig,+T">Tripty Wig</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Kumar,+K">Kriti Kumar</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Kumari,+P">Pushpanjali Kumari</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Puri,+V">Vivek Puri</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Arora,+M">Manish Arora</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 11 pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Hardware Architecture (cs.AR)</span> </div> <p class='mathjax'> Cerebras&#39; wafer-scale engine (WSE) technology merges multiple dies on a single wafer. It addresses the challenges of memory bandwidth, latency, and scalability, making it suitable for artificial intelligence. 
This work evaluates the WSE-3 architecture and compares it with leading GPU-based AI accelerators, notably Nvidia&#39;s H100 and B200. The work highlights the advantages of WSE-3 in performance per watt and memory scalability and provides insights into the challenges in manufacturing, thermal management, and reliability. The results suggest that wafer-scale integration can surpass conventional architectures in several metrics, though work is required to address cost-effectiveness and long-term viability. </p> </div> </dd> <dt> <a name='item12'>[12]</a> <a href ="/abs/2503.11707" title="Abstract" id="2503.11707"> arXiv:2503.11707 </a> [<a href="/pdf/2503.11707" title="Download PDF" id="pdf-2503.11707" aria-labelledby="pdf-2503.11707">pdf</a>, <a href="https://arxiv.org/html/2503.11707v1" title="View HTML" id="html-2503.11707" aria-labelledby="html-2503.11707" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.11707" title="Other formats" id="oth-2503.11707" aria-labelledby="oth-2503.11707">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> EDEA: Efficient Dual-Engine Accelerator for Depthwise Separable Convolution with Direct Data Transfer </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Chen,+Y">Yi Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Lou,+J">Jie Lou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wabnitz,+M">Malte Wabnitz</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Loh,+J">Johnson Loh</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Gemmeke,+T">Tobias Gemmeke</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Hardware Architecture (cs.AR)</span>; Distributed, Parallel, and Cluster Computing (cs.DC) </div> <p class='mathjax'> Depthwise separable convolution (DSC) has 
emerged as a crucial technique, especially for resource-constrained devices. In this paper, we propose a dual-engine for the DSC hardware accelerator, which enables the full utilization of depthwise convolution (DWC) and pointwise convolution (PWC) processing elements (PEs) in all DSC layers. To determine the optimal dataflow, data reuse, and configuration of the target architecture, we conduct a design space exploration using MobileNetV1 with the CIFAR10 dataset. In the architecture, we introduce an additional non-convolutional unit, which merges the dequantization, batch normalization (BN), ReLU, and quantization between DWC and PWC into a simple fixed-point multiplication and addition operation. This also reduces the intermediate data access between the DWC and PWC, enabling streaming operation and reducing latency. The proposed DSC dual-engine accelerator is implemented using the 22nm FDSOI technology from GlobalFoundries, occupying an area of 0.58 $mm^2$. After signoff, it can operate at 1 GHz at TT corner, achieving a peak energy efficiency of 13.43 TOPS/W with a throughput of 973.55 GOPS with 8-bit precision. The average energy efficiency of all DSC layers on MobileNetV1 is 11.13 TOPS/W, demonstrating substantial hardware efficiency improvements for DSC-based applications. 
</p> </div> </dd> <dt> <a name='item13'>[13]</a> <a href ="/abs/2503.12512" title="Abstract" id="2503.12512"> arXiv:2503.12512 </a> [<a href="/pdf/2503.12512" title="Download PDF" id="pdf-2503.12512" aria-labelledby="pdf-2503.12512">pdf</a>, <a href="https://arxiv.org/html/2503.12512v1" title="View HTML" id="html-2503.12512" aria-labelledby="html-2503.12512" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.12512" title="Other formats" id="oth-2503.12512" aria-labelledby="oth-2503.12512">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> A Systematic Approach for Multi-objective Double-side Clock Tree Synthesis </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Jiang,+X">Xun Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Lu,+H">Haoran Lu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhao,+Y">Yuxuan Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wang,+J">Jiarui Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Guo,+Z">Zizheng Guo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wu,+H">Heng Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Yu,+B">Bei Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Lim,+S+K">Sung Kyu Lim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wang,+R">Runsheng Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Huang,+R">Ru Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Lin,+Y">Yibo Lin</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Hardware Architecture (cs.AR)</span> </div> <p class='mathjax'> As the scaling of semiconductor devices nears its limits, utilizing the back-side space of silicon has emerged as 
a new trend for future integrated circuits. With intense interest, several works have hacked existing backend tools to explore the potential of synthesizing double-side clock trees via nano Through-Silicon-Vias (nTSVs). However, these works lack a systematic perspective on design resource allocation and multi-objective optimization. We propose a systematic approach to design clock trees with double-side metal layers, including hierarchical clock routing, concurrent buffers and nTSVs insertion, and skew refinement. Compared with the state-of-the-art (SOTA) methods, the widely-used open-source tool, our algorithm outperforms them in latency, skew, wirelength, and the number of buffers and nTSVs. </p> </div> </dd> <dt> <a name='item14'>[14]</a> <a href ="/abs/2503.12829" title="Abstract" id="2503.12829"> arXiv:2503.12829 </a> [<a href="/pdf/2503.12829" title="Download PDF" id="pdf-2503.12829" aria-labelledby="pdf-2503.12829">pdf</a>, <a href="https://arxiv.org/html/2503.12829v1" title="View HTML" id="html-2503.12829" aria-labelledby="html-2503.12829" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.12829" title="Other formats" id="oth-2503.12829" aria-labelledby="oth-2503.12829">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> SparseLUT: Sparse Connectivity Optimization for Lookup Table-based Deep Neural Networks </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Lou,+B">Binglei Lou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wu,+R">Ruilin Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Leong,+P">Philip Leong</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Hardware Architecture (cs.AR)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> The deployment of deep neural networks (DNNs) on resource-constrained edge 
devices such as field-programmable gate arrays (FPGAs) requires a careful balance of latency, power, and resource usage while maintaining high accuracy. Existing Lookup Table (LUT)-based DNNs, including LogicNets, PolyLUT, PolyLUT-Add, and NeuraLUT, exploit native FPGA resources with random sparse connectivity. This paper introduces SparseLUT, a connectivity-centric training technique tailored for LUT-based DNNs. SparseLUT leverages a non-greedy training strategy that prioritizes the pruning of less significant connections and strategically regrows alternative ones, resulting in efficient convergence to the target sparsity. Experimental results show consistent accuracy improvements across benchmarks, including up to a 2.13\% increase on MNIST and a 0.94\% improvement for Jet Substructure Classification compared to random sparsity. This is done without any hardware overhead and achieves state-of-the-art results for LUT-based DNNs. </p> </div> </dd> <dt> <a name='item15'>[15]</a> <a href ="/abs/2503.12946" title="Abstract" id="2503.12946"> arXiv:2503.12946 </a> [<a href="/pdf/2503.12946" title="Download PDF" id="pdf-2503.12946" aria-labelledby="pdf-2503.12946">pdf</a>, <a href="https://arxiv.org/html/2503.12946v1" title="View HTML" id="html-2503.12946" aria-labelledby="html-2503.12946" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.12946" title="Other formats" id="oth-2503.12946" aria-labelledby="oth-2503.12946">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Open3DBench: Open-Source Benchmark for 3D-IC Backend Implementation and PPA Evaluation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Shi,+Y">Yunqi Shi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Gao,+C">Chengrui Gao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Ren,+W">Wanqi Ren</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&amp;query=Xu,+S">Siyuan Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Xue,+K">Ke Xue</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Yuan,+M">Mingxuan Yuan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Qian,+C">Chao Qian</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhou,+Z">Zhi-Hua Zhou</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Hardware Architecture (cs.AR)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> This work introduces Open3DBench, an open-source 3D-IC backend implementation benchmark built upon the OpenROAD-flow-scripts framework, enabling comprehensive evaluation of power, performance, area, and thermal metrics. Our proposed flow supports modular integration of 3D partitioning, placement, 3D routing, RC extraction, and thermal simulation, aligning with advanced 3D flows that rely on commercial tools and in-house scripts. We present two foundational 3D placement algorithms: Open3D-Tiling, which emphasizes regular macro placement, and Open3D-DMP, which enhances wirelength optimization through cross-die co-placement with analytical placer DREAMPlace. Experimental results show significant improvements in area (51.19%), wirelength (24.06%), timing (30.84%), and power (5.72%) compared to 2D flows. The results also highlight that better wirelength does not necessarily lead to PPA gain, emphasizing the need of developing PPA-driven methods. Open3DBench offers a standardized, reproducible platform for evaluating 3D EDA methods, effectively bridging the gap between open-source tools and commercial solutions in 3D-IC design. 
</p> </div> </dd> <dt> <a name='item16'>[16]</a> <a href ="/abs/2503.12988" title="Abstract" id="2503.12988"> arXiv:2503.12988 </a> [<a href="/pdf/2503.12988" title="Download PDF" id="pdf-2503.12988" aria-labelledby="pdf-2503.12988">pdf</a>, <a href="https://arxiv.org/html/2503.12988v1" title="View HTML" id="html-2503.12988" aria-labelledby="html-2503.12988" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.12988" title="Other formats" id="oth-2503.12988" aria-labelledby="oth-2503.12988">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> ROMA: a Read-Only-Memory-based Accelerator for QLoRA-based On-Device LLM </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wang,+W">Wenqiang Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhang,+Y">Yijia Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhang,+Z">Zikai Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Huo,+G">Guanting Huo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Liang,+H">Hao Liang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Cao,+S">Shijie Cao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Xu,+N">Ningyi Xu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Hardware Architecture (cs.AR)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> As large language models (LLMs) demonstrate powerful capabilities, deploying them on edge devices has become increasingly crucial, offering advantages in privacy and real-time interaction. QLoRA has emerged as the standard approach for on-device LLMs, leveraging quantized models to reduce memory and computational costs while utilizing LoRA for task-specific adaptability. 
In this work, we propose ROMA, a QLoRA accelerator with a hybrid storage architecture that uses ROM for quantized base models and SRAM for LoRA weights and KV cache. Our insight is that the quantized base model is stable and converged, making it well-suited for ROM storage. Meanwhile, LoRA modules offer the flexibility to adapt to new data without requiring updates to the base model. To further reduce the area cost of ROM, we introduce a novel B-ROM design and integrate it with the compute unit to form a fused cell for efficient use of chip resources. ROMA can effectively store both a 4-bit 3B and a 2-bit 8B LLaMA model entirely on-chip, achieving a notable generation speed exceeding 20,000 tokens/s without requiring external memory. </p> </div> </dd> <dt> <a name='item17'>[17]</a> <a href ="/abs/2503.13064" title="Abstract" id="2503.13064"> arXiv:2503.13064 </a> [<a href="/pdf/2503.13064" title="Download PDF" id="pdf-2503.13064" aria-labelledby="pdf-2503.13064">pdf</a>, <a href="https://arxiv.org/html/2503.13064v1" title="View HTML" id="html-2503.13064" aria-labelledby="html-2503.13064" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.13064" title="Other formats" id="oth-2503.13064" aria-labelledby="oth-2503.13064">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> HERMES: High-Performance RISC-V Memory Hierarchy for ML Workloads </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Suryadevara,+P">Pranav Suryadevara</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 5 pages, 5 figures. 
Individual Project </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Hardware Architecture (cs.AR)</span>; Performance (cs.PF) </div> <p class='mathjax'> The growth of machine learning (ML) workloads has underscored the importance of efficient memory hierarchies to address bandwidth, latency, and scalability challenges. HERMES focuses on optimizing memory subsystems for RISC-V architectures to meet the computational needs of ML models such as CNNs, RNNs, and Transformers. This project explores state-of-the-art techniques such as advanced prefetching, tensor-aware caching, and hybrid memory models. The cornerstone of HERMES is the integration of shared L3 caches with fine-grained coherence protocols and specialized pathways to deep learning accelerators like Gemmini. Simulation tools like Gem5 and DRAMSim2 are used to evaluate baseline performance and scalability under representative ML workloads. The findings of this study highlight the design choices and anticipated challenges, paving the way for low-latency scalable memory operations for ML applications. 
</p> </div> </dd> <dt> <a name='item18'>[18]</a> <a href ="/abs/2503.13105" title="Abstract" id="2503.13105"> arXiv:2503.13105 </a> [<a href="/pdf/2503.13105" title="Download PDF" id="pdf-2503.13105" aria-labelledby="pdf-2503.13105">pdf</a>, <a href="https://arxiv.org/html/2503.13105v1" title="View HTML" id="html-2503.13105" aria-labelledby="html-2503.13105" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.13105" title="Other formats" id="oth-2503.13105" aria-labelledby="oth-2503.13105">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Managing Hybrid Solid-State Drives Using Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wei,+Q">Qian Wei</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Li,+Y">Yi Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Chen,+Z">Zehao Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Shen,+Z">Zhaoyan Shen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Yu,+D">Dongxiao Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Li,+B">Bingzhe Li</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Hardware Architecture (cs.AR)</span> </div> <p class='mathjax'> Hybrid Solid-State Drives (SSDs), which integrate several types of flash cells (e.g., single-level cell (SLC) and multiple-level cell (MLC)) in a single drive and enable them to convert between each other, are designed to deliver both high performance and high storage capacity. However, compared to traditional SSDs, hybrid SSDs also introduce a much larger design space, resulting in higher optimization complexity due to more design factors involved, including flash conversion timing and data migration between different flash cells, etc. 
To address these challenges, large language models (LLMs) could be a promising technique, as they excel in handling complex, high-dimensional parameter space exploration by leveraging their advanced capability to identify patterns and optimize solutions. Recent works have started exploring the use of LLMs to optimize computer systems. However, to the best of our knowledge, no study has focused on optimizing SSDs with the assistance of LLMs. <br>In this work, we explore the potential of LLMs in understanding and efficiently managing hybrid SSD design space. Specifically, two important questions are explored and analyzed: 1) Can LLMs offer optimization potential for Hybrid SSD management? 2) How to leverage LLMs for the performance and efficiency of hybrid SSD optimization? Based on the observations of exploration, we propose a comprehensive auto-tuning framework for hybrid SSDs, integrating LLMs to recommend customized configurations using calibration prompts derived from hardware, system, and workload information. Experimental results reveal a 62.35% improvement in throughput and a 57.99% decrease in write amplification compared to the default hybrid SSD configurations achieved with the incorporation of LLMs. 
</p> </div> </dd> <dt> <a name='item19'>[19]</a> <a href ="/abs/2503.13301" title="Abstract" id="2503.13301"> arXiv:2503.13301 </a> [<a href="/pdf/2503.13301" title="Download PDF" id="pdf-2503.13301" aria-labelledby="pdf-2503.13301">pdf</a>, <a href="https://arxiv.org/html/2503.13301v1" title="View HTML" id="html-2503.13301" aria-labelledby="html-2503.13301" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.13301" title="Other formats" id="oth-2503.13301" aria-labelledby="oth-2503.13301">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> LIMCA: LLM for Automating Analog In-Memory Computing Architecture Design Exploration </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Vungarala,+D">Deepak Vungarala</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Amin,+M+H">Md Hasibul Amin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Mercati,+P">Pietro Mercati</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Ghosh,+A">Arnob Ghosh</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Roohi,+A">Arman Roohi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zand,+R">Ramtin Zand</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Angizi,+S">Shaahin Angizi</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 4 Figures, 5 Tables </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Hardware Architecture (cs.AR)</span> </div> <p class='mathjax'> Resistive crossbars enabling analog In-Memory Computing (IMC) have emerged as a promising architecture for Deep Neural Network (DNN) acceleration, offering high memory bandwidth and in-situ computation. 
However, the manual, knowledge-intensive design process and the lack of high-quality circuit netlists have significantly constrained design space exploration and optimization to behavioral system-level tools. In this work, we introduce LIMCA, a novel fine-tune-free Large Language Model (LLM)-driven framework for automating the design and evaluation of IMC crossbar architectures. Unlike traditional approaches, LIMCA employs a No-Human-In-Loop (NHIL) automated pipeline to generate and validate circuit netlists for SPICE simulations, eliminating manual intervention. LIMCA systematically explores the IMC design space by leveraging a structured dataset and LLM-based performance evaluation. Our experimental results on MNIST classification demonstrate that LIMCA successfully generates crossbar designs achieving $\geq$96% accuracy while maintaining a power consumption $\leq$3W, making this the first work in LLM-assisted IMC design space exploration. Compared to existing frameworks, LIMCA provides an automated, scalable, and hardware-aware solution, reducing design exploration time while ensuring user-constrained performance trade-offs. 
</p> </div> </dd> </dl> <dl id='articles'> <h3>Cross submissions (showing 1 of 1 entries)</h3> <dt> <a name='item20'>[20]</a> <a href ="/abs/2503.13116" title="Abstract" id="2503.13116"> arXiv:2503.13116 </a> (cross-list from cs.CR) [<a href="/pdf/2503.13116" title="Download PDF" id="pdf-2503.13116" aria-labelledby="pdf-2503.13116">pdf</a>, <a href="https://arxiv.org/html/2503.13116v1" title="View HTML" id="html-2503.13116" aria-labelledby="html-2503.13116" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.13116" title="Other formats" id="oth-2503.13116" aria-labelledby="oth-2503.13116">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> VeriLeaky: Navigating IP Protection vs Utility in Fine-Tuning for LLM-Driven Verilog Coding </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wang,+Z">Zeng Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Shao,+M">Minghao Shao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Nabeel,+M">Mohammed Nabeel</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Roy,+P+B">Prithwish Basu Roy</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Mankali,+L">Likhitha Mankali</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Bhandari,+J">Jitendra Bhandari</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Karri,+R">Ramesh Karri</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Sinanoglu,+O">Ozgur Sinanoglu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Shafique,+M">Muhammad Shafique</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Knechtel,+J">Johann Knechtel</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Cryptography and Security (cs.CR)</span>; Hardware Architecture 
(cs.AR); Machine Learning (cs.LG) </div> <p class='mathjax'> Large language models (LLMs) offer significant potential for coding, yet fine-tuning (FT) with curated data is essential for niche languages like Verilog. Using proprietary intellectual property (IP) for FT presents a serious risk, as FT data can be leaked through LLM inference. This leads to a critical dilemma for design houses: seeking to build externally accessible LLMs offering competitive Verilog coding, how can they leverage in-house IP to enhance FT utility while ensuring IP protection? <br>For the first time in the literature, we study this dilemma. Using LLaMA 3.1-8B, we conduct in-house FT on a baseline Verilog dataset (RTLCoder) supplemented with our own in-house IP, which is validated through multiple tape-outs. To rigorously assess IP leakage, we quantify structural similarity (AST/Dolos) and functional equivalence (Synopsys Formality) between generated codes and our in-house IP. We show that our IP can indeed be leaked, confirming the threat. As defense, we evaluate logic locking of Verilog codes (ASSURE). This offers some level of protection, yet reduces the IP&#39;s utility for FT and degrades the LLM&#39;s performance. Our study shows the need for novel strategies that are both effective and minimally disruptive to FT, an essential effort for enabling design houses to fully utilize their proprietary IP toward LLM-driven Verilog coding. 
</p> </div> </dd> </dl> <dl id='articles'> <h3>Replacement submissions (showing 5 of 5 entries)</h3> <dt> <a name='item21'>[21]</a> <a href ="/abs/2411.14299" title="Abstract" id="2411.14299"> arXiv:2411.14299 </a> (replaced) [<a href="/pdf/2411.14299" title="Download PDF" id="pdf-2411.14299" aria-labelledby="pdf-2411.14299">pdf</a>, <a href="https://arxiv.org/html/2411.14299v4" title="View HTML" id="html-2411.14299" aria-labelledby="html-2411.14299" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.14299" title="Other formats" id="oth-2411.14299" aria-labelledby="oth-2411.14299">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Masala-CHAI: A Large-Scale SPICE Netlist Dataset for Analog Circuits by Harnessing AI </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Bhandari,+J">Jitendra Bhandari</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Bhat,+V">Vineet Bhat</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=He,+Y">Yuheng He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Rahmani,+H">Hamed Rahmani</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Garg,+S">Siddharth Garg</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Karri,+R">Ramesh Karri</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Hardware Architecture (cs.AR)</span> </div> <p class='mathjax'> Masala-CHAI is the first fully automated framework leveraging large language models (LLMs) to generate Simulation Programs with Integrated Circuit Emphasis (SPICE) netlists. It addresses a long-standing challenge in automating netlist generation for analog circuits within circuit design automation. Automating this workflow could accelerate the creation of finetuned LLMs for analog circuit design and verification. 
We identify key challenges in this automation and evaluate the multi-modal capabilities of state-of-the-art LLMs, particularly GPT-4, to address these issues. We propose a three-step workflow to overcome current limitations: labeling analog circuits, prompt tuning, and netlist verification. This approach aims to create an end-to-end SPICE netlist generator from circuit schematic images, tackling the long-standing hurdle of accurate netlist generation. Our framework demonstrates significant performance improvements, tested on approximately 2,100 schematics of varying complexity. We open-source this solution for community-driven development. </p> </div> </dd> <dt> <a name='item22'>[22]</a> <a href ="/abs/2503.01162" title="Abstract" id="2503.01162"> arXiv:2503.01162 </a> (replaced) [<a href="/pdf/2503.01162" title="Download PDF" id="pdf-2503.01162" aria-labelledby="pdf-2503.01162">pdf</a>, <a href="https://arxiv.org/html/2503.01162v2" title="View HTML" id="html-2503.01162" aria-labelledby="html-2503.01162" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.01162" title="Other formats" id="oth-2503.01162" aria-labelledby="oth-2503.01162">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> CogSys: Efficient and Scalable Neurosymbolic Cognition System via Algorithm-Hardware Co-Design </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wan,+Z">Zishen Wan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Yang,+H">Hanchen Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Raj,+R">Ritik Raj</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Liu,+C">Che-Kai Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Samajdar,+A">Ananda Samajdar</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Raychowdhury,+A">Arijit Raychowdhury</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&amp;query=Krishna,+T">Tushar Krishna</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 2025 IEEE International Symposium on High-Performance Computer Architecture (HPCA), 15 pages, 19 figures, 10 tables </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Hardware Architecture (cs.AR)</span> </div> <p class='mathjax'> Neurosymbolic AI is an emerging compositional paradigm that fuses neural learning with symbolic reasoning to enhance the transparency, interpretability, and trustworthiness of AI. It also exhibits higher data efficiency making it promising for edge deployments. Despite the algorithmic promises and demonstrations, unfortunately executing neurosymbolic workloads on current hardware (CPU/GPU/TPU) is challenging due to higher memory intensity, greater compute heterogeneity and access pattern irregularity, leading to severe hardware underutilization. <br>This work proposes CogSys, a characterization and co-design framework dedicated to neurosymbolic AI system acceleration, aiming to win both reasoning efficiency and scalability. On the algorithm side, CogSys proposes an efficient factorization technique to alleviate compute and memory overhead. On the hardware side, CogSys proposes a scalable neurosymbolic architecture with reconfigurable neuro/symbolic processing elements (nsPE) and bubble streaming (BS) dataflow with spatial-temporal (ST) mapping for highly parallel and efficient neurosymbolic computation. On the system side, CogSys features an adaptive workload-aware scheduler (adSCH) to orchestrate heterogeneous kernels and enhance resource utilization. Evaluated across cognitive workloads, CogSys enables reconfigurable support for neural and symbolic kernels and exhibits &gt;75x speedup over TPU-like systolic array with only &lt;5% area overhead, as benchmarked under the TSMC 28nm technology node. 
CogSys achieves 4x-96x speedup compared to desktop and edge GPUs. For the first time, CogSys enables real-time abduction reasoning towards human fluid intelligence, requiring only 0.3 s per reasoning task with 4 mm² area and 1.48 W power consumption. </p> </div> </dd> <dt> <a name='item23'>[23]</a> <a href ="/abs/2503.09975" title="Abstract" id="2503.09975"> arXiv:2503.09975 </a> (replaced) [<a href="/pdf/2503.09975" title="Download PDF" id="pdf-2503.09975" aria-labelledby="pdf-2503.09975">pdf</a>, <a href="https://arxiv.org/html/2503.09975v3" title="View HTML" id="html-2503.09975" aria-labelledby="html-2503.09975" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.09975" title="Other formats" id="oth-2503.09975" aria-labelledby="oth-2503.09975">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Faster Inference of LLMs using FP8 on the Intel Gaudi </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Lee,+J">Joonhyung Lee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Markovich-Golan,+S">Shmulik Markovich-Golan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Ohayon,+D">Daniel Ohayon</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Hanani,+Y">Yair Hanani</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Park,+G">Gunho Park</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Kim,+B">Byeongwook Kim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Karnieli,+A">Asaf Karnieli</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Livne,+U">Uri Livne</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Shen,+H">Haihao Shen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Huang,+T">Tai Huang</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&amp;query=Kwon,+S+J">Se Jung Kwon</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Lee,+D">Dongsoo Lee</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Hardware Architecture (cs.AR)</span> </div> <p class='mathjax'> Low-precision data types are essential in modern neural networks during both training and inference as they enhance throughput and computational capacity by better exploiting available hardware resources. Despite the incorporation of FP8 in commercially available neural network accelerators, a comprehensive exposition of its underlying mechanisms, along with rigorous performance and accuracy evaluations, is still lacking. In this work, we contribute in three significant ways. First, we analyze the implementation details and quantization options associated with FP8 for inference on the Intel Gaudi AI accelerator. Second, we empirically quantify the throughput improvements afforded by the use of FP8 at both the operator level and in end-to-end scenarios. Third, we assess the accuracy impact of various FP8 quantization methods. Our experimental results indicate that the Intel Gaudi 2 accelerator consistently achieves high computational unit utilization, frequently exceeding 90% MFU, while incurring an accuracy degradation of less than 1%. 
</p> </div> </dd> <dt> <a name='item24'>[24]</a> <a href ="/abs/2403.04714" title="Abstract" id="2403.04714"> arXiv:2403.04714 </a> (replaced) [<a href="/pdf/2403.04714" title="Download PDF" id="pdf-2403.04714" aria-labelledby="pdf-2403.04714">pdf</a>, <a href="https://arxiv.org/html/2403.04714v2" title="View HTML" id="html-2403.04714" aria-labelledby="html-2403.04714" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2403.04714" title="Other formats" id="oth-2403.04714" aria-labelledby="oth-2403.04714">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Parendi: Thousand-Way Parallel RTL Simulation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Emami,+M">Mahyar Emami</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Bourgeat,+T">Thomas Bourgeat</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Larus,+J">James Larus</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Distributed, Parallel, and Cluster Computing (cs.DC)</span>; Hardware Architecture (cs.AR) </div> <p class='mathjax'> Hardware development critically depends on cycle-accurate RTL simulation. However, as chip complexity increases, conventional single-threaded simulation becomes impractical due to stagnant single-core performance. <br>Parendi is an RTL simulator that addresses this challenge by exploiting the abundant fine-grained parallelism inherent in RTL simulation and efficiently mapping it onto the massively parallel Graphcore IPU (Intelligence Processing Unit) architecture. Parendi scales up to 5888 cores on 4 Graphcore IPU sockets. It allows us to run large RTL designs up to 4$\times$ faster than the most powerful state-of-the-art x64 multicore systems. 
<br>To achieve this performance, we developed new partitioning and compilation techniques and carefully quantified the synchronization, communication, and computation costs of parallel RTL simulation: The paper comprehensively analyzes these factors and details the strategies that Parendi uses to optimize them. </p> </div> </dd> <dt> <a name='item25'>[25]</a> <a href ="/abs/2407.17432" title="Abstract" id="2407.17432"> arXiv:2407.17432 </a> (replaced) [<a href="/pdf/2407.17432" title="Download PDF" id="pdf-2407.17432" aria-labelledby="pdf-2407.17432">pdf</a>, <a href="https://arxiv.org/html/2407.17432v2" title="View HTML" id="html-2407.17432" aria-labelledby="html-2407.17432" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2407.17432" title="Other formats" id="oth-2407.17432" aria-labelledby="oth-2407.17432">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> An FPGA-Based Open-Source Hardware-Software Framework for Side-Channel Security Research </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zoni,+D">Davide Zoni</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Galimberti,+A">Andrea Galimberti</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Galli,+D">Davide Galli</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 13 pages, 8 figures, 4 tables </div> <div class='list-journal-ref'><span class='descriptor'>Journal-ref:</span> IEEE Transactions on Computers, 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Cryptography and Security (cs.CR)</span>; Hardware Architecture (cs.AR) </div> <p class='mathjax'> Attacks based on side-channel analysis (SCA) pose a severe security threat to modern computing platforms, further exacerbated on IoT devices by their pervasiveness and handling of private and critical 
data. Designing SCA-resistant computing platforms requires a significant additional effort in the early stages of the IoT devices&#39; life cycle, which is severely constrained by strict time-to-market deadlines and tight budgets. This manuscript introduces a hardware-software framework meant for SCA research on FPGA targets. It delivers an IoT-class system-on-chip (SoC) that includes a RISC-V CPU, provides observability and controllability through an ad-hoc debug infrastructure to facilitate SCA attacks and evaluate the platform&#39;s security, and streamlines the deployment of SCA countermeasures through dedicated hardware and software features such as a DFS actuator and FreeRTOS support. The open-source release of the framework includes the SoC, the scripts to configure the computing platform, compile a target application, and assess the SCA security, as well as a suite of state-of-the-art attacks and countermeasures. The goal is to foster its adoption and novel developments in the field, empowering designers and researchers to focus on studying SCA countermeasures and attacks while relying on a sound and stable hardware-software platform as the foundation for their research. 
</p> </div> </dd> </dl> <div class='paging'>Total of 25 entries </div> <div class='morefewer'>Showing up to 2000 entries per page: <a href=/list/cs.AR/new?skip=0&amp;show=1000 rel="nofollow"> fewer</a> | <span style="color: #454545">more</span> | <span style="color: #454545">all</span> </div> </div> </div> </div> </main> <footer style="clear: both;"> <div class="columns is-desktop" role="navigation" aria-label="Secondary" style="margin: -0.75em -0.75em 0.75em -0.75em"> <!-- Macro-Column 1 --> <div class="column" style="padding: 0;"> <div class="columns"> <div class="column"> <ul style="list-style: none; line-height: 2;"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul style="list-style: none; line-height: 2;"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a 
href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- End Macro-Column 1 --> <!-- Macro-Column 2 --> <div class="column" style="padding: 0;"> <div class="columns"> <div class="column"> <ul style="list-style: none; line-height: 2;"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul style="list-style: none; line-height: 2;"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon 
filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> <!-- End Macro-Column 2 --> </div> </footer> </div> <script src="/static/base/1.0.1/js/member_acknowledgement.js"></script> </body> </html>

Pages: 1 2 3 4 5 6 7 8 9 10