Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 402 results for author: <span class="mathjax">Garg, S</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Garg%2C+S">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Garg, S"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Garg%2C+S&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Garg, S"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Garg%2C+S&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Garg%2C+S&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Garg%2C+S&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Garg%2C+S&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Garg%2C+S&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Garg%2C+S&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.08643">arXiv:2502.08643</a> <span> [<a href="https://arxiv.org/pdf/2502.08643">pdf</a>, <a href="https://arxiv.org/format/2502.08643">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> A Real-to-Sim-to-Real Approach to Robotic Manipulation with VLM-Generated Iterative Keypoint Rewards </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Patel%2C+S">Shivansh Patel</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+X">Xinchen Yin</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+W">Wenlong Huang</a>, <a href="/search/cs?searchtype=author&query=Garg%2C+S">Shubham Garg</a>, <a href="/search/cs?searchtype=author&query=Nayyeri%2C+H">Hooshang Nayyeri</a>, <a href="/search/cs?searchtype=author&query=Fei-Fei%2C+L">Li Fei-Fei</a>, <a href="/search/cs?searchtype=author&query=Lazebnik%2C+S">Svetlana Lazebnik</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yunzhu Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.08643v1-abstract-short" style="display: inline;"> Task specification for robotic manipulation in open-world environments is challenging, requiring flexible and adaptive 
objectives that align with human intentions and can evolve through iterative feedback. We introduce Iterative Keypoint Reward (IKER), a visually grounded, Python-based reward function that serves as a dynamic task specification. Our framework leverages VLMs to generate and refine… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08643v1-abstract-full').style.display = 'inline'; document.getElementById('2502.08643v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.08643v1-abstract-full" style="display: none;"> Task specification for robotic manipulation in open-world environments is challenging, requiring flexible and adaptive objectives that align with human intentions and can evolve through iterative feedback. We introduce Iterative Keypoint Reward (IKER), a visually grounded, Python-based reward function that serves as a dynamic task specification. Our framework leverages VLMs to generate and refine these reward functions for multi-step manipulation tasks. Given RGB-D observations and free-form language instructions, we sample keypoints in the scene and generate a reward function conditioned on these keypoints. IKER operates on the spatial relationships between keypoints, leveraging commonsense priors about the desired behaviors, and enabling precise SE(3) control. We reconstruct real-world scenes in simulation and use the generated rewards to train reinforcement learning (RL) policies, which are then deployed into the real world-forming a real-to-sim-to-real loop. Our approach demonstrates notable capabilities across diverse scenarios, including both prehensile and non-prehensile tasks, showcasing multi-step task execution, spontaneous error recovery, and on-the-fly strategy adjustments. The results highlight IKER's effectiveness in enabling robots to perform multi-step tasks in dynamic environments through iterative reward shaping. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08643v1-abstract-full').style.display = 'none'; document.getElementById('2502.08643v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
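The abstract describes IKER as a Python-based reward over keypoint relations. As a rough illustration only, the sketch below shows the kind of function such a system might emit; the keypoint indices, the alignment test, and the shaping terms are hypothetical, not taken from the paper.

```python
# Illustrative sketch of a keypoint-conditioned reward of the kind IKER-style
# systems generate; indices and thresholds are hypothetical.
import numpy as np

def reward(keypoints: np.ndarray) -> float:
    """keypoints: (N, 3) array of 3D keypoint positions in the world frame."""
    grasp_kp, target_kp = keypoints[0], keypoints[3]   # hypothetical indices
    dist = np.linalg.norm(grasp_kp - target_kp)        # spatial-relation term
    above = keypoints[1][2] > keypoints[2][2]          # e.g. "kp1 above kp2"
    return -dist + (1.0 if above else 0.0)             # dense shaping + bonus
```

In the loop the abstract describes, a VLM would rewrite such a function across iterations, and the reward would drive RL training in the reconstructed simulation.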
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICRA 2025, Project Page: https://iker-robot.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.00922">arXiv:2502.00922</a> <span> [<a href="https://arxiv.org/pdf/2502.00922">pdf</a>, <a href="https://arxiv.org/format/2502.00922">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> </div> </div> <p class="title is-5 mathjax"> Huff-LLM: End-to-End Lossless Compression for Efficient LLM Inference </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yubeaton%2C+P">Patrick Yubeaton</a>, <a href="/search/cs?searchtype=author&query=Mahmoud%2C+T">Tareq Mahmoud</a>, <a href="/search/cs?searchtype=author&query=Naga%2C+S">Shehab Naga</a>, <a href="/search/cs?searchtype=author&query=Taheri%2C+P">Pooria Taheri</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+T">Tianhua Xia</a>, <a href="/search/cs?searchtype=author&query=George%2C+A">Arun George</a>, <a href="/search/cs?searchtype=author&query=Khalil%2C+Y">Yasmein Khalil</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S+Q">Sai Qian Zhang</a>, <a href="/search/cs?searchtype=author&query=Joshi%2C+S">Siddharth Joshi</a>, <a href="/search/cs?searchtype=author&query=Hegde%2C+C">Chinmay Hegde</a>, <a href="/search/cs?searchtype=author&query=Garg%2C+S">Siddharth Garg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.00922v1-abstract-short" style="display: inline;"> As they become more capable, large language models (LLMs) have continued to rapidly increase in size. This has exacerbated the difficulty in running state of the art LLMs on small, edge devices. Standard techniques advocate solving this problem through lossy compression techniques such as quantization or pruning. However, such compression techniques are lossy, and have been shown to change model b… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00922v1-abstract-full').style.display = 'inline'; document.getElementById('2502.00922v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.00922v1-abstract-full" style="display: none;"> As they become more capable, large language models (LLMs) have continued to rapidly increase in size. This has exacerbated the difficulty in running state of the art LLMs on small, edge devices. Standard techniques advocate solving this problem through lossy compression techniques such as quantization or pruning. However, such compression techniques are lossy, and have been shown to change model behavior in unpredictable manners. We propose Huff-LLM, an \emph{end-to-end, lossless} model compression method that lets users store LLM weights in compressed format \emph{everywhere} -- cloud, disk, main memory, and even in on-chip memory/buffers. 
This allows us to not only load larger models in main memory, but also reduces bandwidth required to load weights on chip, and makes more efficient use of on-chip weight buffers. In addition to the memory savings achieved via compression, we also show latency and energy efficiency improvements when performing inference with the compressed model. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00922v1-abstract-full').style.display = 'none'; document.getElementById('2502.00922v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.17295">arXiv:2501.17295</a> <span> [<a href="https://arxiv.org/pdf/2501.17295">pdf</a>, <a href="https://arxiv.org/format/2501.17295">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Mitigating Hallucinated Translations in Large Language Models with Hallucination-focused Preference Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tang%2C+Z">Zilu Tang</a>, <a href="/search/cs?searchtype=author&query=Chatterjee%2C+R">Rajen Chatterjee</a>, <a href="/search/cs?searchtype=author&query=Garg%2C+S">Sarthak Garg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.17295v1-abstract-short" style="display: inline;"> Machine Translation (MT) is undergoing a paradigm shift, with systems based on fine-tuned large language models (LLM) becoming increasingly competitive with traditional encoder-decoder models trained specifically for translation tasks. However, LLM-based systems are at a higher risk of generating hallucinations, which can severely undermine user's trust and safety. Most prior research on hallucina… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.17295v1-abstract-full').style.display = 'inline'; document.getElementById('2501.17295v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.17295v1-abstract-full" style="display: none;"> Machine Translation (MT) is undergoing a paradigm shift, with systems based on fine-tuned large language models (LLM) becoming increasingly competitive with traditional encoder-decoder models trained specifically for translation tasks. However, LLM-based systems are at a higher risk of generating hallucinations, which can severely undermine user's trust and safety. Most prior research on hallucination mitigation focuses on traditional MT models, with solutions that involve post-hoc mitigation - detecting hallucinated translations and re-translating them. 
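As background on the name: a minimal byte-level Huffman coder, one standard form of lossless compression, is sketched below. It is a generic textbook construction and says nothing about the paper's actual format or its on-chip decompression path.

```python
# Generic Huffman coding over weight bytes: a minimal sketch of lossless
# compression, not the Huff-LLM pipeline itself.
import heapq
from collections import Counter

def huffman_code(data: bytes) -> dict[int, str]:
    heap = [(f, i, s) for i, (s, f) in enumerate(Counter(data).items())]
    heapq.heapify(heap)
    next_id = len(heap)
    while len(heap) > 1:                       # merge the two rarest subtrees
        f1, _, t1 = heapq.heappop(heap)
        f2, _, t2 = heapq.heappop(heap)
        heapq.heappush(heap, (f1 + f2, next_id, (t1, t2)))
        next_id += 1
    code: dict[int, str] = {}
    def walk(tree, prefix=""):
        if isinstance(tree, tuple):
            walk(tree[0], prefix + "0")
            walk(tree[1], prefix + "1")
        else:
            code[tree] = prefix or "0"
    walk(heap[0][2])
    return code

weights = bytes([0, 0, 1, 0, 2, 0, 1, 0])      # toy quantized weight bytes
table = huffman_code(weights)
bits = "".join(table[b] for b in weights)      # compressed bitstream
assert len(bits) < 8 * len(weights)            # fewer bits than raw storage
```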
3. arXiv:2501.17295 [pdf, other]
Categories: cs.CL (Computation and Language); cs.AI (Artificial Intelligence); cs.LG (Machine Learning)
Title: Mitigating Hallucinated Translations in Large Language Models with Hallucination-focused Preference Optimization
Authors: Zilu Tang, Rajen Chatterjee, Sarthak Garg
Abstract: Machine Translation (MT) is undergoing a paradigm shift, with systems based on fine-tuned large language models (LLMs) becoming increasingly competitive with traditional encoder-decoder models trained specifically for translation tasks. However, LLM-based systems are at a higher risk of generating hallucinations, which can severely undermine users' trust and safety. Most prior research on hallucination mitigation focuses on traditional MT models, with solutions that involve post-hoc mitigation: detecting hallucinated translations and re-translating them. While effective, this approach introduces additional complexity in deploying extra tools in production and also increases latency. To address these limitations, we propose a method that intrinsically learns to mitigate hallucinations during the model training phase. Specifically, we introduce a data creation framework to generate hallucination-focused preference datasets. Fine-tuning LLMs on these preference datasets reduces the hallucination rate by an average of 96% across five language pairs, while preserving overall translation quality. In a zero-shot setting, our approach reduces hallucinations by 89% on average across three unseen target languages.
Submitted: 28 January, 2025; originally announced January 2025.
Comments: NAACL 2025 Main Conference Long paper (9 pages)
Journal ref: NAACL 2025
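To make the training-time idea concrete, here is one plausible shape for a preference-pair builder for DPO-style tuning; the detector interface and the record schema are assumptions, not the paper's actual data-creation framework.

```python
# Hypothetical preference-pair construction for hallucination-focused tuning.
def make_preference_pairs(samples, hallucination_score, threshold=0.5):
    """samples: iterable of (source_sentence, [candidate_translations])."""
    pairs = []
    for src, candidates in samples:
        ranked = sorted(candidates, key=hallucination_score)  # low = faithful
        if hallucination_score(ranked[-1]) > threshold:       # worst one hallucinates
            pairs.append({"prompt": src,
                          "chosen": ranked[0],                # faithful translation
                          "rejected": ranked[-1]})            # hallucinated one
    return pairs
```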
4. arXiv:2501.02464 [pdf, other]
Categories: cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence); cs.RO (Robotics)
Title: Depth Any Camera: Zero-Shot Metric Depth Estimation from Any Camera
Authors: Yuliang Guo, Sparsh Garg, S. Mahdi H. Miangoleh, Xinyu Huang, Liu Ren
Abstract: While recent depth estimation methods exhibit strong zero-shot generalization, achieving accurate metric depth across diverse camera types, particularly those with large fields of view (FoV) such as fisheye and 360-degree cameras, remains a significant challenge. This paper presents Depth Any Camera (DAC), a powerful zero-shot metric depth estimation framework that extends a perspective-trained model to effectively handle cameras with varying FoVs. The framework is designed to ensure that all existing 3D data can be leveraged, regardless of the specific camera types used in new applications. Remarkably, DAC is trained exclusively on perspective images but generalizes seamlessly to fisheye and 360-degree cameras without the need for specialized training data. DAC employs Equi-Rectangular Projection (ERP) as a unified image representation, enabling consistent processing of images with diverse FoVs. Its key components include a pitch-aware Image-to-ERP conversion for efficient online augmentation in ERP space, a FoV alignment operation to support effective training across a wide range of FoVs, and multi-resolution data augmentation to address resolution disparities between training and testing. DAC achieves state-of-the-art zero-shot metric depth estimation, improving delta-1 ($\delta_1$) accuracy by up to 50% on multiple fisheye and 360-degree datasets compared to prior metric depth foundation models, demonstrating robust generalization across camera types.
Submitted: 5 January, 2025; originally announced January 2025.
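The ERP representation the abstract mentions is ordinary equirectangular projection, where each pixel indexes a latitude/longitude ray. The generic mapping is sketched below; the paper's pitch-aware conversion and FoV alignment are not reproduced here.

```python
# Standard equirectangular (ERP) pixel-to-ray mapping; generic projection
# math, not the DAC implementation.
import numpy as np

def erp_ray_directions(h: int, w: int) -> np.ndarray:
    """Unit ray direction for every pixel of an h x w ERP image."""
    lon = (np.arange(w) + 0.5) / w * 2 * np.pi - np.pi     # [-pi, pi)
    lat = np.pi / 2 - (np.arange(h) + 0.5) / h * np.pi     # [+pi/2, -pi/2]
    lon, lat = np.meshgrid(lon, lat)                       # (h, w) grids
    return np.stack([np.cos(lat) * np.sin(lon),            # x (right)
                     np.sin(lat),                          # y (up)
                     np.cos(lat) * np.cos(lon)], axis=-1)  # z (forward)
```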
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.00921">arXiv:2501.00921</a> <span> [<a href="https://arxiv.org/pdf/2501.00921">pdf</a>, <a href="https://arxiv.org/format/2501.00921">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Aligning Netlist to Source Code using SynAlign </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Garg%2C+S">Sakshi Garg</a>, <a href="/search/cs?searchtype=author&query=Renau%2C+J">Jose Renau</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.00921v1-abstract-short" style="display: inline;"> In current chip design processes, using multiple tools to obtain a gate-level netlist often results in the loss of source code correlation. SynAlign addresses this challenge by automating the alignment process, simplifying iterative design, reducing overhead, and maintaining correlation across various tools. This enhances the efficiency and effectiveness of chip design workflows. Improving chara… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.00921v1-abstract-full').style.display = 'inline'; document.getElementById('2501.00921v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.00921v1-abstract-full" style="display: none;"> In current chip design processes, using multiple tools to obtain a gate-level netlist often results in the loss of source code correlation. SynAlign addresses this challenge by automating the alignment process, simplifying iterative design, reducing overhead, and maintaining correlation across various tools. This enhances the efficiency and effectiveness of chip design workflows. Improving characteristics such as frequency through iterative design is essential for enhancing accelerators and chip designs. While synthesis tools produce netlists with critical path information, designers often lack the tools to trace these netlist cells back to their original source code. Mapping netlist components to source code provides early feedback on timing and power for frontend designers. SynAlign automatically aligns post-optimized netlists with the original source code without altering compilers or synthesis processes. Its alignment strategy relies on the consistent design structure throughout the chip design cycle, even with changes in compiler flow. This consistency allows engineers to maintain a correlation between modified designs and the original source code across various tools. Remarkably, SynAlign can tolerate up to 61\% design net changes without impacting alignment accuracy. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.00921v1-abstract-full').style.display = 'none'; document.getElementById('2501.00921v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.17009">arXiv:2412.17009</a> <span> [<a href="https://arxiv.org/pdf/2412.17009">pdf</a>, <a href="https://arxiv.org/format/2412.17009">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Generate to Discriminate: Expert Routing for Continual Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Byun%2C+Y">Yewon Byun</a>, <a href="/search/cs?searchtype=author&query=Mehta%2C+S+V">Sanket Vaibhav Mehta</a>, <a href="/search/cs?searchtype=author&query=Garg%2C+S">Saurabh Garg</a>, <a href="/search/cs?searchtype=author&query=Strubell%2C+E">Emma Strubell</a>, <a href="/search/cs?searchtype=author&query=Oberst%2C+M">Michael Oberst</a>, <a href="/search/cs?searchtype=author&query=Wilder%2C+B">Bryan Wilder</a>, <a href="/search/cs?searchtype=author&query=Lipton%2C+Z+C">Zachary C. Lipton</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.17009v2-abstract-short" style="display: inline;"> In many real-world settings, regulations and economic incentives permit the sharing of models but not data across institutional boundaries. In such scenarios, practitioners might hope to adapt models to new domains, without losing performance on previous domains (so-called catastrophic forgetting). While any single model may struggle to achieve this goal, learning an ensemble of domain-specific ex… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.17009v2-abstract-full').style.display = 'inline'; document.getElementById('2412.17009v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.17009v2-abstract-full" style="display: none;"> In many real-world settings, regulations and economic incentives permit the sharing of models but not data across institutional boundaries. In such scenarios, practitioners might hope to adapt models to new domains, without losing performance on previous domains (so-called catastrophic forgetting). While any single model may struggle to achieve this goal, learning an ensemble of domain-specific experts offers the potential to adapt more closely to each individual institution. However, a core challenge in this context is determining which expert to deploy at test time. In this paper, we propose Generate to Discriminate (G2D), a domain-incremental continual learning method that leverages synthetic data to train a domain-discriminator that routes samples at inference time to the appropriate expert. 
6. arXiv:2412.17009 [pdf, other]
Categories: cs.LG (Machine Learning)
Title: Generate to Discriminate: Expert Routing for Continual Learning
Authors: Yewon Byun, Sanket Vaibhav Mehta, Saurabh Garg, Emma Strubell, Michael Oberst, Bryan Wilder, Zachary C. Lipton
Abstract: In many real-world settings, regulations and economic incentives permit the sharing of models but not data across institutional boundaries. In such scenarios, practitioners might hope to adapt models to new domains, without losing performance on previous domains (so-called catastrophic forgetting). While any single model may struggle to achieve this goal, learning an ensemble of domain-specific experts offers the potential to adapt more closely to each individual institution. However, a core challenge in this context is determining which expert to deploy at test time. In this paper, we propose Generate to Discriminate (G2D), a domain-incremental continual learning method that leverages synthetic data to train a domain discriminator that routes samples at inference time to the appropriate expert. Surprisingly, we find that leveraging synthetic data in this capacity is more effective than using the samples to directly train the downstream classifier (the more common approach to leveraging synthetic data in the lifelong learning literature). We observe that G2D outperforms competitive domain-incremental learning methods on tasks in both vision and language modalities, providing a new perspective on the use of synthetic data in the lifelong learning literature.
Submitted: 27 December, 2024 (v2); v1 submitted 22 December, 2024; originally announced December 2024.
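The routing step is straightforward once the discriminator exists; a sketch, with all interfaces assumed rather than taken from the paper:

```python
# G2D-style inference: a domain discriminator, trained on synthetic samples
# from per-domain generators, routes each input to one expert. Hypothetical
# interfaces; the paper's models are vision/language networks.
def g2d_predict(x, discriminator, experts):
    domain = discriminator(x)        # predicted source domain of the sample
    return experts[domain](x)        # only the selected expert is evaluated

def discriminator_training_set(generators, n_per_domain=1000):
    """Label synthetic samples by the domain of the generator that made them."""
    return [(g.sample(), d)
            for d, g in enumerate(generators)
            for _ in range(n_per_domain)]
```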
7. arXiv:2412.15349 [pdf, other]
Categories: cs.MA (Multiagent Systems); cs.LG (Machine Learning)
Title: Adaptive Urban Planning: A Hybrid Framework for Balanced City Development
Authors: Pratham Singla, Ayush Singh, Adesh Gupta, Shivank Garg
Abstract: Urban planning faces a critical challenge in balancing city-wide infrastructure needs with localized demographic preferences, particularly in rapidly developing regions. Existing approaches typically focus on top-down optimization or bottom-up community planning, and few frameworks successfully integrate both perspectives. Our methodology employs a two-tier approach: first, a deterministic solver optimizes basic infrastructure requirements in the city region. Second, four specialized planning agents, each representing distinct sub-regions, propose demographic-specific modifications to a master planner. The master planner then evaluates and integrates these suggestions to ensure cohesive urban development. We validate our framework using a newly created dataset comprising detailed region and sub-region maps from three developing cities in India, focusing on areas undergoing rapid urbanization. The results demonstrate that this hybrid approach enables more nuanced urban development while maintaining overall city functionality.
Submitted: 19 December, 2024; originally announced December 2024.
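A sketch of the two-tier control flow described above, with every interface assumed rather than taken from the paper:

```python
# Two-tier planning loop: deterministic solver first, then agent proposals
# integrated by a master planner. All objects are hypothetical stand-ins.
def plan_city(region, solver, agents, master_planner):
    base_plan = solver.optimize(region)                         # tier 1: city-wide infrastructure
    proposals = [agent.propose(base_plan) for agent in agents]  # tier 2: sub-region edits
    return master_planner.integrate(base_plan, proposals)       # keep the plan cohesive
```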
8. arXiv:2412.10594 [pdf, other]
Categories: cs.CV (Computer Vision and Pattern Recognition); cs.LG (Machine Learning)
Title: Towards Unified Benchmark and Models for Multi-Modal Perceptual Metrics
Authors: Sara Ghazanfari, Siddharth Garg, Nicolas Flammarion, Prashanth Krishnamurthy, Farshad Khorrami, Francesco Croce
Abstract: Human perception of similarity across uni- and multi-modal inputs is highly complex, making it challenging to develop automated metrics that accurately mimic it. General-purpose vision-language models, such as CLIP and large multi-modal models (LMMs), can be applied as zero-shot perceptual metrics, and several recent works have developed models specialized in narrow perceptual tasks. However, the extent to which existing perceptual metrics align with human perception remains unclear. To investigate this question, we introduce UniSim-Bench, a benchmark encompassing 7 multi-modal perceptual similarity tasks with a total of 25 datasets. Our evaluation reveals that while general-purpose models perform reasonably well on average, they often lag behind specialized models on individual tasks. Conversely, metrics fine-tuned for specific tasks fail to generalize well to unseen, though related, tasks. As a first step towards a unified multi-task perceptual similarity metric, we fine-tune both encoder-based and generative vision-language models on a subset of the UniSim-Bench tasks. This approach yields the highest average performance, and in some cases even surpasses task-specific models. Nevertheless, these models still struggle with generalization to unseen tasks, highlighting the ongoing challenge of learning a robust, unified perceptual similarity metric capable of capturing the human notion of similarity. The code and models are available at https://github.com/SaraGhazanfari/UniSim.
Submitted: 13 December, 2024; originally announced December 2024.
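The zero-shot baseline the abstract mentions amounts to comparing embeddings; a generic sketch follows, where the encoder `embed` is an assumed stand-in for a CLIP-like model, not UniSim's API.

```python
# Embedding model as a zero-shot perceptual metric, plus the 2AFC protocol
# common in perceptual-similarity benchmarks. Generic sketch only.
import numpy as np

def perceptual_similarity(img_a, img_b, embed) -> float:
    a, b = embed(img_a), embed(img_b)
    a, b = a / np.linalg.norm(a), b / np.linalg.norm(b)
    return float(a @ b)                      # cosine similarity in [-1, 1]

def two_afc_choice(ref, cand0, cand1, embed) -> int:
    """Which candidate is perceptually closer to the reference?"""
    return int(perceptual_similarity(ref, cand1, embed) >
               perceptual_similarity(ref, cand0, embed))
```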
9. arXiv:2412.09379 [pdf, other]
Categories: cs.LG (Machine Learning)
Title: Hybrid variable spiking graph neural networks for energy-efficient scientific machine learning
Authors: Isha Jain, Shailesh Garg, Shaurya Shriyam, Souvik Chakraborty
Abstract: Graph-based representations for samples of computational mechanics-related datasets can prove instrumental when dealing with problems like irregular domains or molecular structures of materials. To effectively analyze and process such datasets, deep learning offers Graph Neural Networks (GNNs) that utilize techniques like message-passing within their architecture. The issue, however, is that as the individual graph scales and/or the GNN architecture becomes increasingly complex, the increased energy budget of the overall deep learning model makes it unsustainable and restricts its use in applications like edge computing. To overcome this, we propose Hybrid Variable Spiking Graph Neural Networks (HVS-GNNs) that utilize Variable Spiking Neurons (VSNs) within their architecture to promote sparse communication and hence reduce the overall energy budget. VSNs, while promoting sparse event-driven computations, also perform well for regression tasks, which are often encountered in computational mechanics applications and are the main target of this paper. Three examples dealing with prediction of mechanical properties of material based on microscale/mesoscale structures are shown to test the performance of the proposed HVS-GNNs in regression tasks. We have also compared the performance of HVS-GNN architectures with that of vanilla GNNs and GNNs utilizing leaky integrate-and-fire neurons. The results show that HVS-GNNs perform well for regression tasks, all while promoting sparse communication and, hence, energy efficiency.
Submitted: 12 December, 2024; originally announced December 2024.
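The energy argument rests on event-driven sparsity. A generic leaky integrate-and-fire step, shown below, conveys the mechanism; the paper's Variable Spiking Neurons differ in details the abstract does not give.

```python
# Generic leaky integrate-and-fire step: inputs accumulate in a membrane
# potential and a (sparse) spike is emitted only past a threshold. Not the
# paper's Variable Spiking Neuron.
import numpy as np

def lif_step(x, v, threshold=1.0, leak=0.9):
    """x: input current (array); v: membrane state. Returns (spikes, v)."""
    v = leak * v + x
    spikes = (v >= threshold).astype(x.dtype)    # mostly zeros -> cheap messages
    v = np.where(spikes > 0, 0.0, v)             # reset neurons that fired
    return spikes, v
```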
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.09369">arXiv:2412.09369</a> <span> [<a href="https://arxiv.org/pdf/2412.09369">pdf</a>, <a href="https://arxiv.org/format/2412.09369">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Distribution free uncertainty quantification in neuroscience-inspired deep operators </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Garg%2C+S">Shailesh Garg</a>, <a href="/search/cs?searchtype=author&query=Chakraborty%2C+S">Souvik Chakraborty</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.09369v1-abstract-short" style="display: inline;"> Energy-efficient deep learning algorithms are essential for a sustainable future and feasible edge computing setups. Spiking neural networks (SNNs), inspired from neuroscience, are a positive step in the direction of achieving the required energy efficiency. However, in a bid to lower the energy requirements, accuracy is marginally sacrificed. Hence, predictions of such deep learning algorithms re… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.09369v1-abstract-full').style.display = 'inline'; document.getElementById('2412.09369v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.09369v1-abstract-full" style="display: none;"> Energy-efficient deep learning algorithms are essential for a sustainable future and feasible edge computing setups. Spiking neural networks (SNNs), inspired from neuroscience, are a positive step in the direction of achieving the required energy efficiency. However, in a bid to lower the energy requirements, accuracy is marginally sacrificed. Hence, predictions of such deep learning algorithms require an uncertainty measure that can inform users regarding the bounds of a certain output. In this paper, we introduce the Conformalized Randomized Prior Operator (CRP-O) framework that leverages Randomized Prior (RP) networks and Split Conformal Prediction (SCP) to quantify uncertainty in both conventional and spiking neural operators. To further enable zero-shot super-resolution in UQ, we propose an extension incorporating Gaussian Process Regression. This enhanced super-resolution-enabled CRP-O framework is integrated with the recently developed Variable Spiking Wavelet Neural Operator (VSWNO). To test the performance of the obtained calibrated uncertainty bounds, we discuss four different examples covering both one-dimensional and two-dimensional partial differential equations. Results demonstrate that the uncertainty bounds produced by the conformalized RP-VSWNO significantly enhance UQ estimates compared to vanilla RP-VSWNO, Quantile WNO (Q-WNO), and Conformalized Quantile WNO (CQ-WNO). These findings underscore the potential of the proposed approach for practical applications. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.09369v1-abstract-full').style.display = 'none'; document.getElementById('2412.09369v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.06827">arXiv:2412.06827</a> <span> [<a href="https://arxiv.org/pdf/2412.06827">pdf</a>, <a href="https://arxiv.org/format/2412.06827">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Enhancing LLMs for Physics Problem-Solving using Reinforcement Learning with Human-AI Feedback </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Anand%2C+A">Avinash Anand</a>, <a href="/search/cs?searchtype=author&query=Prasad%2C+K">Kritarth Prasad</a>, <a href="/search/cs?searchtype=author&query=Kirtani%2C+C">Chhavi Kirtani</a>, <a href="/search/cs?searchtype=author&query=Nair%2C+A+R">Ashwin R Nair</a>, <a href="/search/cs?searchtype=author&query=Gupta%2C+M">Mohit Gupta</a>, <a href="/search/cs?searchtype=author&query=Garg%2C+S">Saloni Garg</a>, <a href="/search/cs?searchtype=author&query=Gautam%2C+A">Anurag Gautam</a>, <a href="/search/cs?searchtype=author&query=Buldeo%2C+S">Snehal Buldeo</a>, <a href="/search/cs?searchtype=author&query=Shah%2C+R+R">Rajiv Ratn Shah</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.06827v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) have demonstrated strong capabilities in text-based tasks but struggle with the complex reasoning required for physics problems, particularly in advanced arithmetic and conceptual understanding. While some research has explored ways to enhance LLMs in physics education using techniques such as prompt engineering and Retrieval Augmentation Generation (RAG), not enough e… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.06827v1-abstract-full').style.display = 'inline'; document.getElementById('2412.06827v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.06827v1-abstract-full" style="display: none;"> Large Language Models (LLMs) have demonstrated strong capabilities in text-based tasks but struggle with the complex reasoning required for physics problems, particularly in advanced arithmetic and conceptual understanding. While some research has explored ways to enhance LLMs in physics education using techniques such as prompt engineering and Retrieval Augmentation Generation (RAG), not enough effort has been made in addressing their limitations in physics reasoning. This paper presents a novel approach to improving LLM performance on physics questions using Reinforcement Learning with Human and Artificial Intelligence Feedback (RLHAIF). 
We evaluate several reinforcement learning methods, including Proximal Policy Optimization (PPO), Direct Preference Optimization (DPO), and Remax optimization. These methods are chosen to investigate RL policy performance with different settings on the PhyQA dataset, which includes challenging physics problems from high school textbooks. Our RLHAIF model, tested on leading LLMs like LLaMA2 and Mistral, achieved superior results, notably with the MISTRAL-PPO model, demonstrating marked improvements in reasoning and accuracy. It achieved high scores, with a 58.67 METEOR score and a 0.74 Reasoning score, making it a strong example for future physics reasoning research in this area. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.06827v1-abstract-full').style.display = 'none'; document.getElementById('2412.06827v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.06168">arXiv:2412.06168</a> <span> [<a href="https://arxiv.org/pdf/2412.06168">pdf</a>, <a href="https://arxiv.org/format/2412.06168">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Out-of-Distribution Detection with Overlap Index </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Fu%2C+H">Hao Fu</a>, <a href="/search/cs?searchtype=author&query=Krishnamurthy%2C+P">Prashanth Krishnamurthy</a>, <a href="/search/cs?searchtype=author&query=Garg%2C+S">Siddharth Garg</a>, <a href="/search/cs?searchtype=author&query=Khorrami%2C+F">Farshad Khorrami</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.06168v1-abstract-short" style="display: inline;"> Out-of-distribution (OOD) detection is crucial for the deployment of machine learning models in the open world. While existing OOD detectors are effective in identifying OOD samples that deviate significantly from in-distribution (ID) data, they often come with trade-offs. For instance, deep OOD detectors usually suffer from high computational costs, require tuning hyperparameters, and have limite… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.06168v1-abstract-full').style.display = 'inline'; document.getElementById('2412.06168v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.06168v1-abstract-full" style="display: none;"> Out-of-distribution (OOD) detection is crucial for the deployment of machine learning models in the open world. While existing OOD detectors are effective in identifying OOD samples that deviate significantly from in-distribution (ID) data, they often come with trade-offs. 
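The overlap index itself is the mass shared by two densities, OI = ∫ min(p, q); a simple histogram estimator is sketched below. The paper's confidence score builds on OI in ways the abstract does not spell out.

```python
# Histogram estimate of the overlap index OI = integral of min(p, q),
# the quantity underlying the OI-based confidence score. One-dimensional
# sketch; the bin count is an arbitrary choice.
import numpy as np

def overlap_index(xs: np.ndarray, ys: np.ndarray, bins: int = 50) -> float:
    lo, hi = min(xs.min(), ys.min()), max(xs.max(), ys.max())
    p, edges = np.histogram(xs, bins=bins, range=(lo, hi), density=True)
    q, _ = np.histogram(ys, bins=bins, range=(lo, hi), density=True)
    width = edges[1] - edges[0]
    return float(np.minimum(p, q).sum() * width)   # 1.0 = identical, 0.0 = disjoint
```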
arXiv:2412.06168 [cs.LG, stat.ML] (https://arxiv.org/abs/2412.06168)
Out-of-Distribution Detection with Overlap Index
Authors: Hao Fu, Prashanth Krishnamurthy, Siddharth Garg, Farshad Khorrami
Abstract: Out-of-distribution (OOD) detection is crucial for the deployment of machine learning models in the open world. While existing OOD detectors are effective in identifying OOD samples that deviate significantly from in-distribution (ID) data, they often come with trade-offs. For instance, deep OOD detectors usually suffer from high computational costs, require tuning hyperparameters, and have limited interpretability, whereas traditional OOD detectors may have low accuracy on large high-dimensional datasets. To address these limitations, we propose a novel, effective OOD detection approach that employs an overlap index (OI)-based confidence score function to evaluate the likelihood of a given input belonging to the same distribution as the available ID samples. The proposed OI-based confidence score function is non-parametric, lightweight, and easy to interpret, hence providing strong flexibility and generality. Extensive empirical evaluations indicate that our OI-based OOD detector is competitive with state-of-the-art OOD detectors in terms of detection accuracy on a wide range of datasets while incurring lower computation and memory costs. Lastly, we show that the proposed OI-based confidence score function inherits nice properties from OI (e.g., insensitivity to small distributional variations and robustness against Huber $\varepsilon$-contamination) and is a versatile tool for estimating OI and model accuracy in specific contexts.
Submitted 8 December, 2024; originally announced December 2024.
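For intuition about the quantity the confidence score is built on: the overlap index between two densities p and q is the probability mass they share, OI(p, q) = ∫ min(p(x), q(x)) dx, which is 1 for identical distributions and 0 for disjoint ones. A toy histogram estimate for one-dimensional samples is sketched below; it only illustrates the definition and is not the paper's estimator or score function.

    import numpy as np

    def overlap_index(x, y, bins=50):
        """Histogram estimate of OI(p, q) = integral of min(p, q) in 1-D."""
        lo, hi = min(x.min(), y.min()), max(x.max(), y.max())
        p, edges = np.histogram(x, bins=bins, range=(lo, hi), density=True)
        q, _ = np.histogram(y, bins=bins, range=(lo, hi), density=True)
        return np.sum(np.minimum(p, q)) * (edges[1] - edges[0])

    rng = np.random.default_rng(0)
    in_dist = rng.normal(0.0, 1.0, 5000)
    ood = rng.normal(3.0, 1.0, 5000)  # stand-in for OOD inputs
    print(overlap_index(in_dist, rng.normal(0.0, 1.0, 5000)))  # near 1
    print(overlap_index(in_dist, ood))                         # much lower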
arXiv:2412.02594 [cs.AR, cs.AI] (https://arxiv.org/abs/2412.02594)
PrefixLLM: LLM-aided Prefix Circuit Design
Authors: Weihua Xiao, Venkata Sai Charan Putrevu, Raghu Vamshi Hemadri, Siddharth Garg, Ramesh Karri
Abstract: Prefix circuits are fundamental components in digital adders, widely used in digital systems due to their efficiency in calculating carry signals. Synthesizing prefix circuits with minimized area and delay is crucial for enhancing the performance of modern computing systems. Recently, large language models (LLMs) have demonstrated a surprising ability to perform text generation tasks. We propose PrefixLLM, which leverages LLMs for prefix circuit synthesis. PrefixLLM transforms the prefix circuit synthesis task into a structured text generation problem, termed the Structured Prefix Circuit Representation (SPCR), and introduces an iterative framework to automatically and accurately generate valid SPCRs. We further present a design space exploration (DSE) framework that uses LLMs to iteratively search for area- and delay-optimized prefix circuits. Compared to the state of the art, PrefixLLM can reduce the area by 3.70% under the same delay constraint. This work highlights the use of LLMs in the synthesis of arithmetic circuits, which can be transformed into structured text generation.
Submitted 3 December, 2024; originally announced December 2024.
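For background on the object being synthesized: an n-bit adder computes its carries from per-bit (generate, propagate) pairs combined with an associative operator, and a prefix circuit is a particular parallel arrangement of that operator (Sklansky, Kogge-Stone, and so on) trading area for delay. The sketch below computes the carries with a plain serial scan; it shows the function every prefix circuit implements, not the paper's SPCR text format.

    def carries(a, b, cin=0):
        """Carry bits of a + b via a scan over (generate, propagate) pairs.

        The combine step (g1, p1) o (g0, p0) = (g1 | (p1 & g0), p1 & p0)
        is associative, which is what lets prefix circuits evaluate it
        as a parallel tree instead of this serial loop.
        """
        n = max(a.bit_length(), b.bit_length())
        g = [((a >> i) & 1) & ((b >> i) & 1) for i in range(n)]  # generate
        p = [((a >> i) & 1) ^ ((b >> i) & 1) for i in range(n)]  # propagate
        c = [cin]
        for i in range(n):
            c.append(g[i] | (p[i] & c[i]))  # c[i+1] = g_i | (p_i & c_i)
        return c  # c[i] is the carry into bit position i

    # Sanity check against ordinary integer addition.
    a, b = 0b1011, 0b0110
    c = carries(a, b)
    s = sum((((a >> i) & 1) ^ ((b >> i) & 1) ^ c[i]) << i for i in range(4))
    print(bin(s | (c[4] << 4)), bin(a + b))  # both 0b10001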
arXiv:2412.01042 [cs.CR, cs.LG] (https://arxiv.org/abs/2412.01042)
TruncFormer: Private LLM Inference Using Only Truncations
Authors: Patrick Yubeaton, Jianqiao Cambridge Mo, Karthik Garimella, Nandan Kumar Jha, Brandon Reagen, Chinmay Hegde, Siddharth Garg
Abstract: Private inference (PI) serves an important role in guaranteeing the privacy of user data when interfacing with proprietary machine learning models such as LLMs. However, PI remains practically intractable due to the massive latency costs associated with nonlinear functions present in LLMs. Existing works have focused on improving the latency of specific LLM nonlinearities (such as the Softmax or the GeLU) via approximations. However, new types of nonlinearities are regularly introduced with new LLM architectures, and this has led to a constant game of catch-up where PI researchers attempt to optimize the newest nonlinear function. We introduce TruncFormer, a framework for taking any LLM and transforming it into a plaintext emulation of PI. Our framework leverages the fact that nonlinearities in LLMs are differentiable and can be accurately approximated with a sequence of additions, multiplications, and truncations. Further, we decouple the add/multiply and truncation operations, and statically determine where truncations should be inserted based on a given field size and input representation size. This leads to latency improvements over existing cryptographic protocols that enforce truncation after every multiplication operation. We open source our code for community use.
Submitted 1 December, 2024; originally announced December 2024.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.00214v1-abstract-full').style.display = 'none'; document.getElementById('2412.00214v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.18913">arXiv:2411.18913</a> <span> [<a href="https://arxiv.org/pdf/2411.18913">pdf</a>, <a href="https://arxiv.org/format/2411.18913">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Planning Shorter Paths in Graphs of Convex Sets by Undistorting Parametrized Configuration Spaces </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Garg%2C+S">Shruti Garg</a>, <a href="/search/cs?searchtype=author&query=Cohn%2C+T">Thomas Cohn</a>, <a href="/search/cs?searchtype=author&query=Tedrake%2C+R">Russ Tedrake</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.18913v1-abstract-short" style="display: inline;"> Optimization based motion planning provides a useful modeling framework through various costs and constraints. Using Graph of Convex Sets (GCS) for trajectory optimization gives guarantees of feasibility and optimality by representing configuration space as the finite union of convex sets. Nonlinear parametrizations can be used to extend this technique to handle cases such as kinematic loops, but… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18913v1-abstract-full').style.display = 'inline'; document.getElementById('2411.18913v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.18913v1-abstract-full" style="display: none;"> Optimization based motion planning provides a useful modeling framework through various costs and constraints. Using Graph of Convex Sets (GCS) for trajectory optimization gives guarantees of feasibility and optimality by representing configuration space as the finite union of convex sets. Nonlinear parametrizations can be used to extend this technique to handle cases such as kinematic loops, but this distorts distances, such that solving with convex objectives will yield paths that are suboptimal in the original space. We present a method to extend GCS to nonconvex objectives, allowing us to "undistort" the optimization landscape while maintaining feasibility guarantees. We demonstrate our method's efficacy on three different robotic planning domains: a bimanual robot moving an object with both arms, the set of 3D rotations using Euler angles, and a rational parametrization of kinematics that enables certifying regions as collision free. Across the board, our method significantly improves path length and trajectory duration with only a minimal increase in runtime. 
Website: https://shrutigarg914.github.io/pgd-gcs-results/ <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18913v1-abstract-full').style.display = 'none'; document.getElementById('2411.18913v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 6 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.18479">arXiv:2411.18479</a> <span> [<a href="https://arxiv.org/pdf/2411.18479">pdf</a>, <a href="https://arxiv.org/format/2411.18479">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> SoK: Watermarking for AI-Generated Content </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhao%2C+X">Xuandong Zhao</a>, <a href="/search/cs?searchtype=author&query=Gunn%2C+S">Sam Gunn</a>, <a href="/search/cs?searchtype=author&query=Christ%2C+M">Miranda Christ</a>, <a href="/search/cs?searchtype=author&query=Fairoze%2C+J">Jaiden Fairoze</a>, <a href="/search/cs?searchtype=author&query=Fabrega%2C+A">Andres Fabrega</a>, <a href="/search/cs?searchtype=author&query=Carlini%2C+N">Nicholas Carlini</a>, <a href="/search/cs?searchtype=author&query=Garg%2C+S">Sanjam Garg</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+S">Sanghyun Hong</a>, <a href="/search/cs?searchtype=author&query=Nasr%2C+M">Milad Nasr</a>, <a href="/search/cs?searchtype=author&query=Tramer%2C+F">Florian Tramer</a>, <a href="/search/cs?searchtype=author&query=Jha%2C+S">Somesh Jha</a>, <a href="/search/cs?searchtype=author&query=Li%2C+L">Lei Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yu-Xiang Wang</a>, <a href="/search/cs?searchtype=author&query=Song%2C+D">Dawn Song</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.18479v2-abstract-short" style="display: inline;"> As the outputs of generative AI (GenAI) techniques improve in quality, it becomes increasingly challenging to distinguish them from human-created content. Watermarking schemes are a promising approach to address the problem of distinguishing between AI and human-generated content. These schemes embed hidden signals within AI-generated content to enable reliable detection. 
arXiv:2411.18479 [cs.CR, cs.AI, cs.LG] (https://arxiv.org/abs/2411.18479)
SoK: Watermarking for AI-Generated Content
Authors: Xuandong Zhao, Sam Gunn, Miranda Christ, Jaiden Fairoze, Andres Fabrega, Nicholas Carlini, Sanjam Garg, Sanghyun Hong, Milad Nasr, Florian Tramer, Somesh Jha, Lei Li, Yu-Xiang Wang, Dawn Song
Abstract: As the outputs of generative AI (GenAI) techniques improve in quality, it becomes increasingly challenging to distinguish them from human-created content. Watermarking schemes are a promising approach to address the problem of distinguishing between AI and human-generated content. These schemes embed hidden signals within AI-generated content to enable reliable detection. While watermarking is not a silver bullet for addressing all risks associated with GenAI, it can play a crucial role in enhancing AI safety and trustworthiness by combating misinformation and deception. This paper presents a comprehensive overview of watermarking techniques for GenAI, beginning with the need for watermarking from historical and regulatory perspectives. We formalize the definitions and desired properties of watermarking schemes and examine the key objectives and threat models for existing approaches. Practical evaluation strategies are also explored, providing insights into the development of robust watermarking techniques capable of resisting various attacks. Additionally, we review recent representative works, highlight open challenges, and discuss potential directions for this emerging field. By offering a thorough understanding of watermarking in GenAI, this work aims to guide researchers in advancing watermarking methods and applications, and support policymakers in addressing the broader implications of GenAI.
Submitted 19 December, 2024; v1 submitted 27 November, 2024; originally announced November 2024.
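As one concrete instance of "embedding hidden signals": a common family of LLM text watermarks biases generation toward a keyed pseudorandom "green" subset of the vocabulary at each step, and detection counts green tokens and computes a z-score. The toy detector below over integer token IDs illustrates that idea only; the hashing, green fraction, and sampler are invented for this sketch and do not correspond to any specific scheme in the survey.

    import hashlib
    import random

    GAMMA = 0.5  # fraction of the vocabulary that is "green" at each step

    def is_green(prev_tok, tok, key=b"demo-key"):
        h = hashlib.sha256(key + prev_tok.to_bytes(4, "big")
                               + tok.to_bytes(4, "big")).digest()
        return h[0] < 128  # keyed pseudorandom coin with P(green) = GAMMA

    def z_score(tokens):
        n = len(tokens) - 1
        hits = sum(is_green(p, t) for p, t in zip(tokens, tokens[1:]))
        return (hits - GAMMA * n) / (GAMMA * (1 - GAMMA) * n) ** 0.5

    rng = random.Random(0)
    plain = [rng.randrange(50000) for _ in range(400)]   # unwatermarked
    marked = [rng.randrange(50000)]
    for _ in range(399):  # emulate a sampler that prefers green tokens
        cand = [rng.randrange(50000) for _ in range(4)]
        marked.append(next((c for c in cand if is_green(marked[-1], c)),
                           cand[0]))
    print(round(z_score(plain), 1), round(z_score(marked), 1))  # ~0 vs large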
arXiv:2411.15804 [cs.CL, cs.AI, cs.LG] (https://arxiv.org/abs/2411.15804)
LoRA-Mini: Adaptation Matrices Decomposition and Selective Training
Authors: Ayush Singh, Rajdeep Aher, Shivank Garg
Abstract: The rapid advancements in large language models (LLMs) have revolutionized natural language processing, creating an increased need for efficient, task-specific fine-tuning methods. Traditional fine-tuning of LLMs involves updating a large number of parameters, which is computationally expensive and memory-intensive. Low-Rank Adaptation (LoRA) has emerged as a promising solution, enabling parameter-efficient fine-tuning by reducing the number of trainable parameters. However, while LoRA reduces the number of trainable parameters, LoRA modules still create significant storage challenges. We propose LoRA-Mini, an optimized adaptation of LoRA that improves parameter efficiency by splitting low-rank matrices into four parts, with only the two inner matrices being trainable. This approach achieves up to a 20x reduction in the number of trainable parameters compared to standard LoRA while preserving performance levels comparable to standard LoRA, addressing both computational and storage efficiency in LLM fine-tuning.
Submitted 24 November, 2024; originally announced November 2024.
Comments: 11 pages
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14299v3-abstract-full').style.display = 'none'; document.getElementById('2411.14299v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11856">arXiv:2411.11856</a> <span> [<a href="https://arxiv.org/pdf/2411.11856">pdf</a>, <a href="https://arxiv.org/format/2411.11856">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Programming Languages">cs.PL</span> </div> </div> <p class="title is-5 mathjax"> Can EDA Tool Feedback Improve Verilog Generation by LLMs? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Blocklove%2C+J">Jason Blocklove</a>, <a href="/search/cs?searchtype=author&query=Thakur%2C+S">Shailja Thakur</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+B">Benjamin Tan</a>, <a href="/search/cs?searchtype=author&query=Pearce%2C+H">Hammond Pearce</a>, <a href="/search/cs?searchtype=author&query=Garg%2C+S">Siddharth Garg</a>, <a href="/search/cs?searchtype=author&query=Karri%2C+R">Ramesh Karri</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11856v1-abstract-short" style="display: inline;"> Traditionally, digital hardware designs are written in the Verilog hardware description language (HDL) and debugged manually by engineers. This can be time-consuming and error-prone for complex designs. Large Language Models (LLMs) are emerging as a potential tool to help generate fully functioning HDL code, but most works have focused on generation in the single-shot capacity: i.e., run and evalu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11856v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11856v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11856v1-abstract-full" style="display: none;"> Traditionally, digital hardware designs are written in the Verilog hardware description language (HDL) and debugged manually by engineers. This can be time-consuming and error-prone for complex designs. Large Language Models (LLMs) are emerging as a potential tool to help generate fully functioning HDL code, but most works have focused on generation in the single-shot capacity: i.e., run and evaluate, a process that does not leverage debugging and as such does not adequately reflect a realistic development process. In this work we evaluate the ability of LLMs to leverage feedback from electronic design automation (EDA) tools to fix mistakes in their own generated Verilog. 
To accomplish this we present an open-source, highly customizable framework, AutoChip, which combines conversational LLMs with the output from Verilog compilers and simulations to iteratively generate and repair Verilog. To determine the success of these LLMs we leverage the VerilogEval benchmark set. We evaluate four state-of-the-art conversational LLMs, focusing on readily accessible commercial models. EDA tool feedback proved to be consistently more effective than zero-shot prompting only with GPT-4o, the most computationally complex model we evaluated. In the best case we observed a 5.8% increase in the number of successful designs with a 34.2% decrease in cost over the best zero-shot results. Mixing smaller models with this larger model at the end of the feedback iterations resulted in equally as much success as with GPT-4o using feedback, but for an additional 41.9% less cost (overall decrease in cost over zero-shot of 89.6%). <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11856v1-abstract-full').style.display = 'none'; document.getElementById('2411.11856v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.08870">arXiv:2411.08870</a> <span> [<a href="https://arxiv.org/pdf/2411.08870">pdf</a>, <a href="https://arxiv.org/format/2411.08870">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> The Limited Impact of Medical Adaptation of Large Language and Vision-Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jeong%2C+D+P">Daniel P. Jeong</a>, <a href="/search/cs?searchtype=author&query=Mani%2C+P">Pranav Mani</a>, <a href="/search/cs?searchtype=author&query=Garg%2C+S">Saurabh Garg</a>, <a href="/search/cs?searchtype=author&query=Lipton%2C+Z+C">Zachary C. Lipton</a>, <a href="/search/cs?searchtype=author&query=Oberst%2C+M">Michael Oberst</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.08870v1-abstract-short" style="display: inline;"> Several recent works seek to develop foundation models specifically for medical applications, adapting general-purpose large language models (LLMs) and vision-language models (VLMs) via continued pretraining on publicly available biomedical corpora. 
arXiv:2411.08870 [cs.CL, cs.AI, cs.LG] (https://arxiv.org/abs/2411.08870)
The Limited Impact of Medical Adaptation of Large Language and Vision-Language Models
Authors: Daniel P. Jeong, Pranav Mani, Saurabh Garg, Zachary C. Lipton, Michael Oberst
Abstract: Several recent works seek to develop foundation models specifically for medical applications, adapting general-purpose large language models (LLMs) and vision-language models (VLMs) via continued pretraining on publicly available biomedical corpora. These works typically claim that such domain-adaptive pretraining (DAPT) improves performance on downstream medical tasks, such as answering medical licensing exam questions. In this paper, we compare ten public "medical" LLMs and two VLMs against their corresponding base models, arriving at a different conclusion: all medical VLMs and nearly all medical LLMs fail to consistently improve over their base models in the zero-/few-shot prompting and supervised fine-tuning regimes for medical question-answering (QA). For instance, across all tasks and model pairs we consider in the 3-shot setting, medical LLMs only outperform their base models in 22.7% of cases, reach a (statistical) tie in 36.8% of cases, and are significantly worse than their base models in the remaining 40.5% of cases. Our conclusions are based on (i) comparing each medical model head-to-head, directly against the corresponding base model; (ii) optimizing the prompts for each model separately in zero-/few-shot prompting; and (iii) accounting for statistical uncertainty in comparisons. While these basic practices are not consistently adopted in the literature, our ablations show that they substantially impact conclusions. Meanwhile, we find that after fine-tuning on specific QA tasks, medical LLMs can show performance improvements, but the benefits do not carry over to tasks based on clinical notes. Our findings suggest that state-of-the-art general-domain models may already exhibit strong medical knowledge and reasoning capabilities, and offer recommendations to strengthen the conclusions of future studies.
Submitted 13 November, 2024; originally announced November 2024.
Comments: Extended version of EMNLP 2024 paper arXiv:2411.04118. Includes additional results on clinical note QA tasks and supervised fine-tuning evaluations
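Point (iii) above, accounting for statistical uncertainty in a head-to-head comparison, is commonly handled with a paired exact test over per-question correctness, since the medical model and its base model answer the same questions. A minimal sketch using McNemar's exact test on the discordant pairs is below; it is a generic illustration of such a comparison, with invented counts, not the paper's exact procedure.

    from math import comb

    def mcnemar_exact(base_only: int, adapted_only: int) -> float:
        """Two-sided exact McNemar p-value from the discordant pairs:
        questions only the base model got right vs. only the adapted one."""
        n = base_only + adapted_only
        k = min(base_only, adapted_only)
        # Under the null, discordant outcomes are Binomial(n, 1/2).
        tail = sum(comb(n, i) for i in range(k + 1)) / 2 ** n
        return min(1.0, 2 * tail)

    # Hypothetical benchmark: 41 questions only the base model answered
    # correctly, 29 only the domain-adapted model answered correctly.
    print(mcnemar_exact(41, 29))  # ~0.19, i.e. no significant difference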
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05831v1-abstract-full').style.display = 'none'; document.getElementById('2411.05831v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at WACV 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.05359">arXiv:2411.05359</a> <span> [<a href="https://arxiv.org/pdf/2411.05359">pdf</a>, <a href="https://arxiv.org/format/2411.05359">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> </div> </div> <p class="title is-5 mathjax"> Agricultural Landscape Understanding At Country-Scale </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Dua%2C+R">Radhika Dua</a>, <a href="/search/cs?searchtype=author&query=Saxena%2C+N">Nikita Saxena</a>, <a href="/search/cs?searchtype=author&query=Agarwal%2C+A">Aditi Agarwal</a>, <a href="/search/cs?searchtype=author&query=Wilson%2C+A">Alex Wilson</a>, <a href="/search/cs?searchtype=author&query=Singh%2C+G">Gaurav Singh</a>, <a href="/search/cs?searchtype=author&query=Tran%2C+H">Hoang Tran</a>, <a href="/search/cs?searchtype=author&query=Deshpande%2C+I">Ishan Deshpande</a>, <a href="/search/cs?searchtype=author&query=Kaur%2C+A">Amandeep Kaur</a>, <a href="/search/cs?searchtype=author&query=Aggarwal%2C+G">Gaurav Aggarwal</a>, <a href="/search/cs?searchtype=author&query=Nath%2C+C">Chandan Nath</a>, <a href="/search/cs?searchtype=author&query=Basu%2C+A">Arnab Basu</a>, <a href="/search/cs?searchtype=author&query=Batchu%2C+V">Vishal Batchu</a>, <a href="/search/cs?searchtype=author&query=Holla%2C+S">Sharath Holla</a>, <a href="/search/cs?searchtype=author&query=Kurle%2C+B">Bindiya Kurle</a>, <a href="/search/cs?searchtype=author&query=Missura%2C+O">Olana Missura</a>, <a href="/search/cs?searchtype=author&query=Aggarwal%2C+R">Rahul Aggarwal</a>, <a href="/search/cs?searchtype=author&query=Garg%2C+S">Shubhika Garg</a>, <a href="/search/cs?searchtype=author&query=Shah%2C+N">Nishi Shah</a>, <a href="/search/cs?searchtype=author&query=Singh%2C+A">Avneet Singh</a>, <a href="/search/cs?searchtype=author&query=Tewari%2C+D">Dinesh Tewari</a>, <a href="/search/cs?searchtype=author&query=Dondzik%2C+A">Agata Dondzik</a>, <a href="/search/cs?searchtype=author&query=Adsul%2C+B">Bharat Adsul</a>, <a href="/search/cs?searchtype=author&query=Sohoni%2C+M">Milind Sohoni</a>, <a href="/search/cs?searchtype=author&query=Praveen%2C+A+R">Asim Rama Praveen</a>, <a href="/search/cs?searchtype=author&query=Dangi%2C+A">Aaryan Dangi</a> , et al. 
(10 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.05359v1-abstract-short" style="display: inline;"> Agricultural landscapes are quite complex, especially in the Global South where fields are smaller, and agricultural practices are more varied. In this paper we report on our progress in digitizing the agricultural landscape (natural and man-made) in our study region of India. We use high resolution imagery and a UNet style segmentation model to generate the first of its kind national-scale multi-… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05359v1-abstract-full').style.display = 'inline'; document.getElementById('2411.05359v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.05359v1-abstract-full" style="display: none;"> Agricultural landscapes are quite complex, especially in the Global South where fields are smaller, and agricultural practices are more varied. In this paper we report on our progress in digitizing the agricultural landscape (natural and man-made) in our study region of India. We use high resolution imagery and a UNet style segmentation model to generate the first of its kind national-scale multi-class panoptic segmentation output. Through this work we have been able to identify individual fields across 151.7M hectares, and delineating key features such as water resources and vegetation. We share how this output was validated by our team and externally by downstream users, including some sample use cases that can lead to targeted data driven decision making. We believe this dataset will contribute towards digitizing agriculture by generating the foundational baselayer. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05359v1-abstract-full').style.display = 'none'; document.getElementById('2411.05359v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">34 pages, 7 tables, 15 figs</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.04118">arXiv:2411.04118</a> <span> [<a href="https://arxiv.org/pdf/2411.04118">pdf</a>, <a href="https://arxiv.org/format/2411.04118">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Medical Adaptation of Large Language and Vision-Language Models: Are We Making Progress? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jeong%2C+D+P">Daniel P. 
arXiv:2411.04118 [cs.CL, cs.AI, cs.LG] (https://arxiv.org/abs/2411.04118)
Medical Adaptation of Large Language and Vision-Language Models: Are We Making Progress?
Authors: Daniel P. Jeong, Saurabh Garg, Zachary C. Lipton, Michael Oberst
Abstract: Several recent works seek to develop foundation models specifically for medical applications, adapting general-purpose large language models (LLMs) and vision-language models (VLMs) via continued pretraining on publicly available biomedical corpora. These works typically claim that such domain-adaptive pretraining (DAPT) improves performance on downstream medical tasks, such as answering medical licensing exam questions. In this paper, we compare seven public "medical" LLMs and two VLMs against their corresponding base models, arriving at a different conclusion: all medical VLMs and nearly all medical LLMs fail to consistently improve over their base models in the zero-/few-shot prompting regime for medical question-answering (QA) tasks. For instance, across the tasks and model pairs we consider in the 3-shot setting, medical LLMs only outperform their base models in 12.1% of cases, reach a (statistical) tie in 49.8% of cases, and are significantly worse than their base models in the remaining 38.2% of cases. Our conclusions are based on (i) comparing each medical model head-to-head, directly against the corresponding base model; (ii) optimizing the prompts for each model separately; and (iii) accounting for statistical uncertainty in comparisons. While these basic practices are not consistently adopted in the literature, our ablations show that they substantially impact conclusions. Our findings suggest that state-of-the-art general-domain models may already exhibit strong medical knowledge and reasoning capabilities, and offer recommendations to strengthen the conclusions of future studies.
Submitted 19 November, 2024; v1 submitted 6 November, 2024; originally announced November 2024.
Comments: This version was published at EMNLP 2024 Main Conference as a Long Paper (Oral). See the extended version (arXiv:2411.08870) for additional results on QA tasks based on clinical notes and evaluations in the supervised fine-tuning regime
arXiv:2411.03253 [cs.LG, cs.AI, cs.DS] (https://arxiv.org/abs/2411.03253)
Discovering Data Structures: Nearest Neighbor Search and Beyond
Authors: Omar Salemohamed, Laurent Charlin, Shivam Garg, Vatsal Sharan, Gregory Valiant
Abstract: We propose a general framework for end-to-end learning of data structures. Our framework adapts to the underlying data distribution and provides fine-grained control over query and space complexity. Crucially, the data structure is learned from scratch, and does not require careful initialization or seeding with candidate data structures/algorithms. We first apply this framework to the problem of nearest neighbor search. In several settings, we are able to reverse-engineer the learned data structures and query algorithms. For 1D nearest neighbor search, the model discovers optimal distribution (in)dependent algorithms such as binary search and variants of interpolation search. In higher dimensions, the model learns solutions that resemble k-d trees in some regimes, while in others, they have elements of locality-sensitive hashing. The model can also learn useful representations of high-dimensional data and exploit them to design effective data structures. We also adapt our framework to the problem of estimating frequencies over a data stream, and believe it could also be a powerful discovery tool for new problems.
Submitted 5 November, 2024; originally announced November 2024.
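One of the 1-D baselines the abstract says the model rediscovers is interpolation search, which probes where the query "should" sit if the keys were spread uniformly, rather than always probing the midpoint as binary search does. A textbook implementation is sketched below for comparison; it is standard reference code, not output recovered from the learned model.

    def interpolation_search(keys, q):
        """Index of q in a sorted list of numbers, else -1.

        Takes roughly O(log log n) probes on near-uniform keys
        (binary search's O(log n) is the distribution-free fallback),
        but degrades to O(n) on adversarial key layouts.
        """
        lo, hi = 0, len(keys) - 1
        while lo <= hi and keys[lo] <= q <= keys[hi]:
            if keys[lo] == keys[hi]:
                break
            # Probe where q would fall under a uniform-key assumption.
            mid = lo + (q - keys[lo]) * (hi - lo) // (keys[hi] - keys[lo])
            if keys[mid] == q:
                return mid
            if keys[mid] < q:
                lo = mid + 1
            else:
                hi = mid - 1
        return lo if lo < len(keys) and keys[lo] == q else -1

    print(interpolation_search(list(range(0, 1000, 3)), 999))  # 333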
To address this challenge, we introduce an evaluation methodology and a focused benchmark (named ContraProST) aimed at capturing a wide range of prosodic phenomena. Our methodology uses large language models and controllable text-to-speech (TTS) to generate contrastive examples. Through experiments in translating English speech into German, Spanish, and Japanese, we find that (a) S2TT models possess some internal representation of prosody, but the prosody signal is often not strong enough to affect the translations, (b) E2E systems outperform cascades of speech recognition and text translation systems, confirming their theoretical advantage in this regard, and (c) certain cascaded systems also capture prosodic information in the translation, but only to a lesser extent that depends on the particulars of the transcript's surface form. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.24019v1-abstract-full').style.display = 'none'; document.getElementById('2410.24019v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">WMT 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.23571">arXiv:2410.23571</a> <span> [<a href="https://arxiv.org/pdf/2410.23571">pdf</a>, <a href="https://arxiv.org/ps/2410.23571">ps</a>, <a href="https://arxiv.org/format/2410.23571">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Dual Agent Learning Based Aerial Trajectory Tracking </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Garg%2C+S">Shaswat Garg</a>, <a href="/search/cs?searchtype=author&query=Masnavi%2C+H">Houman Masnavi</a>, <a href="/search/cs?searchtype=author&query=Fidan%2C+B">Baris Fidan</a>, <a href="/search/cs?searchtype=author&query=Janabi-Sharifi%2C+F">Farrokh Janabi-Sharifi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.23571v1-abstract-short" style="display: inline;"> This paper presents a novel reinforcement learning framework for trajectory tracking of unmanned aerial vehicles in cluttered environments using a dual-agent architecture. Traditional optimization methods for trajectory tracking face significant computational challenges and lack robustness in dynamic environments. 
Our approach employs deep reinforcement learning (RL) to overcome these limitations,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23571v1-abstract-full').style.display = 'inline'; document.getElementById('2410.23571v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.23571v1-abstract-full" style="display: none;"> This paper presents a novel reinforcement learning framework for trajectory tracking of unmanned aerial vehicles in cluttered environments using a dual-agent architecture. Traditional optimization methods for trajectory tracking face significant computational challenges and lack robustness in dynamic environments. Our approach employs deep reinforcement learning (RL) to overcome these limitations, leveraging 3D pointcloud data to perceive the environment without relying on memory-intensive obstacle representations like occupancy grids. The proposed system features two RL agents: one for predicting UAV velocities to follow a reference trajectory and another for managing collision avoidance in the presence of obstacles. This architecture ensures real-time performance and adaptability to uncertainties. We demonstrate the efficacy of our approach through simulated and real-world experiments, highlighting improvements over state-of-the-art RL and optimization-based methods. Additionally, a curriculum learning paradigm is employed to scale the algorithms to more complex environments, ensuring robust trajectory tracking and obstacle avoidance in both static and dynamic scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23571v1-abstract-full').style.display = 'none'; document.getElementById('2410.23571v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
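<p class="is-size-7">To make the dual-agent split concrete, here is a hedged toy sketch; both classes are hypothetical stand-ins for the learned RL policies, and the additive fusion rule is an assumption for illustration, not the paper's method:</p> <pre><code class="language-python">
import numpy as np

class TrackingAgent:
    """Hypothetical stand-in for the RL policy that predicts UAV velocities
    toward the next reference-trajectory waypoint."""
    def act(self, position, ref_waypoint):
        return 0.5 * (ref_waypoint - position)  # placeholder proportional law

class AvoidanceAgent:
    """Hypothetical stand-in for the RL policy that manages collision
    avoidance directly from 3D point cloud data."""
    def act(self, position, pointcloud):
        if pointcloud.shape[0] == 0:
            return np.zeros(3)
        dists = np.linalg.norm(pointcloud - position, axis=1)
        away = position - pointcloud[dists.argmin()]
        d = np.linalg.norm(away)
        if d < 1e-6 or d >= 1.0:      # react only to nearby obstacles
            return np.zeros(3)
        return (away / d) / d          # repulsion grows as the obstacle nears

def control_step(position, ref_waypoint, pointcloud, tracker, avoider):
    # Assumed fusion rule: sum the two agents' velocity commands.
    return tracker.act(position, ref_waypoint) + avoider.act(position, pointcloud)
</code></pre>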
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.23232">arXiv:2410.23232</a> <span> [<a href="https://arxiv.org/pdf/2410.23232">pdf</a>, <a href="https://arxiv.org/format/2410.23232">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Attribute-to-Delete: Machine Unlearning via Datamodel Matching </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Georgiev%2C+K">Kristian Georgiev</a>, <a href="/search/cs?searchtype=author&query=Rinberg%2C+R">Roy Rinberg</a>, <a href="/search/cs?searchtype=author&query=Park%2C+S+M">Sung Min Park</a>, <a href="/search/cs?searchtype=author&query=Garg%2C+S">Shivam Garg</a>, <a href="/search/cs?searchtype=author&query=Ilyas%2C+A">Andrew Ilyas</a>, <a href="/search/cs?searchtype=author&query=Madry%2C+A">Aleksander Madry</a>, <a href="/search/cs?searchtype=author&query=Neel%2C+S">Seth Neel</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.23232v2-abstract-short" style="display: inline;"> Machine unlearning -- efficiently removing the effect of a small "forget set" of training data on a pre-trained machine learning model -- has recently attracted significant research interest. Despite this interest, however, recent work shows that existing machine unlearning techniques do not hold up to thorough evaluation in non-convex settings. In this work, we introduce a new machine unlearning… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23232v2-abstract-full').style.display = 'inline'; document.getElementById('2410.23232v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.23232v2-abstract-full" style="display: none;"> Machine unlearning -- efficiently removing the effect of a small "forget set" of training data on a pre-trained machine learning model -- has recently attracted significant research interest. Despite this interest, however, recent work shows that existing machine unlearning techniques do not hold up to thorough evaluation in non-convex settings. In this work, we introduce a new machine unlearning technique that exhibits strong empirical performance even in such challenging settings. Our starting point is the perspective that the goal of unlearning is to produce a model whose outputs are statistically indistinguishable from those of a model re-trained on all but the forget set. This perspective naturally suggests a reduction from the unlearning problem to that of data attribution, where the goal is to predict the effect of changing the training set on a model's outputs. Thus motivated, we propose the following meta-algorithm, which we call Datamodel Matching (DMM): given a trained model, we (a) use data attribution to predict the output of the model if it were re-trained on all but the forget set points; then (b) fine-tune the pre-trained model to match these predicted outputs. In a simple convex setting, we show how this approach provably outperforms a variety of iterative unlearning algorithms. 
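<p class="is-size-7">The two-step DMM meta-algorithm described above reads naturally as code. The sketch below assumes a hypothetical attribution_predict oracle standing in for any data-attribution method, and an MSE matching loss; neither is taken from the authors' implementation:</p> <pre><code class="language-python">
import torch
import torch.nn.functional as F

def datamodel_matching(model, attribution_predict, loader, forget_set,
                       lr=1e-4, epochs=1):
    """Sketch of DMM: (a) use data attribution to predict the outputs of a
    model retrained without the forget set; (b) fine-tune the trained model
    to match those predicted outputs."""
    opt = torch.optim.Adam(model.parameters(), lr=lr)
    for _ in range(epochs):
        for x, _ in loader:
            with torch.no_grad():
                target = attribution_predict(x, exclude=forget_set)  # step (a)
            loss = F.mse_loss(model(x), target)                      # step (b)
            opt.zero_grad()
            loss.backward()
            opt.step()
    return model
</code></pre>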
Empirically, we use a combination of existing evaluations and a new metric based on the KL-divergence to show that even in non-convex settings, DMM achieves strong unlearning performance relative to existing algorithms. An added benefit of DMM is that it is a meta-algorithm, in the sense that future advances in data attribution translate directly into better unlearning algorithms, pointing to a clear direction for future progress in unlearning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23232v2-abstract-full').style.display = 'none'; document.getElementById('2410.23232v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.22606">arXiv:2410.22606</a> <span> [<a href="https://arxiv.org/pdf/2410.22606">pdf</a>, <a href="https://arxiv.org/ps/2410.22606">ps</a>, <a href="https://arxiv.org/format/2410.22606">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> </div> </div> <p class="title is-5 mathjax"> Testing Tensor Products of Algebraic Codes </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Garg%2C+S">Sumegha Garg</a>, <a href="/search/cs?searchtype=author&query=Sudan%2C+M">Madhu Sudan</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+G">Gabriel Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.22606v1-abstract-short" style="display: inline;"> Motivated by recent advances in locally testable codes and quantum LDPCs based on robust testability of tensor product codes, we explore the local testability of tensor products of (an abstraction of) algebraic geometry codes. Such codes are parameterized by, in addition to standard parameters such as block length $n$ and dimension $k$, their genus $g$. We show that the tensor product of two algeb… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22606v1-abstract-full').style.display = 'inline'; document.getElementById('2410.22606v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.22606v1-abstract-full" style="display: none;"> Motivated by recent advances in locally testable codes and quantum LDPCs based on robust testability of tensor product codes, we explore the local testability of tensor products of (an abstraction of) algebraic geometry codes. Such codes are parameterized by, in addition to standard parameters such as block length $n$ and dimension $k$, their genus $g$. We show that the tensor product of two algebraic geometry codes is robustly locally testable provided $n = \Omega((k+g)^2)$. Apart from Reed-Solomon codes, this seems to be the first explicit family of codes of super-constant dual distance that is robustly locally testable.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22606v1-abstract-full').style.display = 'none'; document.getElementById('2410.22606v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.22029">arXiv:2410.22029</a> <span> [<a href="https://arxiv.org/pdf/2410.22029">pdf</a>, <a href="https://arxiv.org/format/2410.22029">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Are VLMs Really Blind </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Singh%2C+A">Ayush Singh</a>, <a href="/search/cs?searchtype=author&query=Gupta%2C+M">Mansi Gupta</a>, <a href="/search/cs?searchtype=author&query=Garg%2C+S">Shivank Garg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.22029v1-abstract-short" style="display: inline;"> Vision Language Models excel in handling a wide range of complex tasks, including Optical Character Recognition (OCR), Visual Question Answering (VQA), and advanced geometric reasoning. However, these models fail to perform well on low-level basic visual tasks which are especially easy for humans. Our goal in this work was to determine if these models are truly "blind" to geometric reasoning or if… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22029v1-abstract-full').style.display = 'inline'; document.getElementById('2410.22029v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.22029v1-abstract-full" style="display: none;"> Vision Language Models excel in handling a wide range of complex tasks, including Optical Character Recognition (OCR), Visual Question Answering (VQA), and advanced geometric reasoning. However, these models fail to perform well on low-level basic visual tasks which are especially easy for humans. Our goal in this work was to determine if these models are truly "blind" to geometric reasoning or if there are ways to enhance their capabilities in this area. Our work presents a novel automatic pipeline designed to extract key information from images in response to specific questions. Instead of just relying on direct VQA, we use question-derived keywords to create a caption that highlights important details in the image related to the question. This caption is then used by a language model to provide a precise answer to the question without requiring external fine-tuning. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22029v1-abstract-full').style.display = 'none'; document.getElementById('2410.22029v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">2 pages, 1 figure</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.15314">arXiv:2410.15314</a> <span> [<a href="https://arxiv.org/pdf/2410.15314">pdf</a>, <a href="https://arxiv.org/format/2410.15314">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> KTCR: Improving Implicit Hate Detection with Knowledge Transfer driven Concept Refinement </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Garg%2C+S">Samarth Garg</a>, <a href="/search/cs?searchtype=author&query=Kavuri%2C+V+H">Vivek Hruday Kavuri</a>, <a href="/search/cs?searchtype=author&query=Shroff%2C+G">Gargi Shroff</a>, <a href="/search/cs?searchtype=author&query=Mishra%2C+R">Rahul Mishra</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.15314v1-abstract-short" style="display: inline;"> The constant shifts in social and political contexts, driven by emerging social movements and political events, lead to new forms of hate content and previously unrecognized hate patterns that machine learning models may not have captured. Some recent literature proposes the data augmentation-based techniques to enrich existing hate datasets by incorporating samples that reveal new implicit hate p… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15314v1-abstract-full').style.display = 'inline'; document.getElementById('2410.15314v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.15314v1-abstract-full" style="display: none;"> The constant shifts in social and political contexts, driven by emerging social movements and political events, lead to new forms of hate content and previously unrecognized hate patterns that machine learning models may not have captured. Some recent literature proposes the data augmentation-based techniques to enrich existing hate datasets by incorporating samples that reveal new implicit hate patterns. This approach aims to improve the model's performance on out-of-domain implicit hate instances. It is observed, that further addition of more samples for augmentation results in the decrease of the performance of the model. In this work, we propose a Knowledge Transfer-driven Concept Refinement method that distills and refines the concepts related to implicit hate samples through novel prototype alignment and concept losses, alongside data augmentation based on concept activation vectors. 
Experiments with several publicly available datasets show that incorporating additional implicit samples reflecting new hate patterns through concept refinement enhances the model's performance, surpassing baseline results while maintaining cross-dataset generalization capabilities. (DISCLAIMER: This paper contains explicit statements that are potentially offensive.) <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15314v1-abstract-full').style.display = 'none'; document.getElementById('2410.15314v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, 4 figures, 2 algorithms, 5 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.12124">arXiv:2410.12124</a> <span> [<a href="https://arxiv.org/pdf/2410.12124">pdf</a>, <a href="https://arxiv.org/format/2410.12124">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Affordance-Centric Policy Learning: Sample Efficient and Generalisable Robot Policy Learning using Affordance-Centric Task Frames </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Rana%2C+K">Krishan Rana</a>, <a href="/search/cs?searchtype=author&query=Abou-Chakra%2C+J">Jad Abou-Chakra</a>, <a href="/search/cs?searchtype=author&query=Garg%2C+S">Sourav Garg</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+R">Robert Lee</a>, <a href="/search/cs?searchtype=author&query=Reid%2C+I">Ian Reid</a>, <a href="/search/cs?searchtype=author&query=Suenderhauf%2C+N">Niko Suenderhauf</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.12124v1-abstract-short" style="display: inline;"> Affordances are central to robotic manipulation, where most tasks can be simplified to interactions with task-specific regions on objects. By focusing on these key regions, we can abstract away task-irrelevant information, simplifying the learning process, and enhancing generalisation. In this paper, we propose an affordance-centric policy-learning approach that centres and appropriately o… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12124v1-abstract-full').style.display = 'inline'; document.getElementById('2410.12124v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.12124v1-abstract-full" style="display: none;"> Affordances are central to robotic manipulation, where most tasks can be simplified to interactions with task-specific regions on objects. By focusing on these key regions, we can abstract away task-irrelevant information, simplifying the learning process, and enhancing generalisation.
In this paper, we propose an affordance-centric policy-learning approach that centres and appropriately orients a task frame on these affordance regions, allowing us to achieve both intra-category invariance (where policies can generalise across different instances within the same object category) and spatial invariance (which enables consistent performance regardless of object placement in the environment). We propose a method to leverage existing generalist large vision models to extract and track these affordance frames, and demonstrate that our approach can learn manipulation tasks using behaviour cloning from as little as 10 demonstrations, with equivalent generalisation to an image-based policy trained on 305 demonstrations. We provide video demonstrations on our project site: https://affordance-policy.github.io. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12124v1-abstract-full').style.display = 'none'; document.getElementById('2410.12124v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Video can be found on our project website: https://affordance-policy.github.io</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.07073">arXiv:2410.07073</a> <span> [<a href="https://arxiv.org/pdf/2410.07073">pdf</a>, <a href="https://arxiv.org/format/2410.07073">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Pixtral 12B </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Agrawal%2C+P">Pravesh Agrawal</a>, <a href="/search/cs?searchtype=author&query=Antoniak%2C+S">Szymon Antoniak</a>, <a href="/search/cs?searchtype=author&query=Hanna%2C+E+B">Emma Bou Hanna</a>, <a href="/search/cs?searchtype=author&query=Bout%2C+B">Baptiste Bout</a>, <a href="/search/cs?searchtype=author&query=Chaplot%2C+D">Devendra Chaplot</a>, <a href="/search/cs?searchtype=author&query=Chudnovsky%2C+J">Jessica Chudnovsky</a>, <a href="/search/cs?searchtype=author&query=Costa%2C+D">Diogo Costa</a>, <a href="/search/cs?searchtype=author&query=De+Monicault%2C+B">Baudouin De Monicault</a>, <a href="/search/cs?searchtype=author&query=Garg%2C+S">Saurabh Garg</a>, <a href="/search/cs?searchtype=author&query=Gervet%2C+T">Theophile Gervet</a>, <a href="/search/cs?searchtype=author&query=Ghosh%2C+S">Soham Ghosh</a>, <a href="/search/cs?searchtype=author&query=H%C3%A9liou%2C+A">Amélie Héliou</a>, <a href="/search/cs?searchtype=author&query=Jacob%2C+P">Paul Jacob</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+A+Q">Albert Q.
Jiang</a>, <a href="/search/cs?searchtype=author&query=Khandelwal%2C+K">Kartik Khandelwal</a>, <a href="/search/cs?searchtype=author&query=Lacroix%2C+T">Timothée Lacroix</a>, <a href="/search/cs?searchtype=author&query=Lample%2C+G">Guillaume Lample</a>, <a href="/search/cs?searchtype=author&query=Casas%2C+D+L">Diego Las Casas</a>, <a href="/search/cs?searchtype=author&query=Lavril%2C+T">Thibaut Lavril</a>, <a href="/search/cs?searchtype=author&query=Scao%2C+T+L">Teven Le Scao</a>, <a href="/search/cs?searchtype=author&query=Lo%2C+A">Andy Lo</a>, <a href="/search/cs?searchtype=author&query=Marshall%2C+W">William Marshall</a>, <a href="/search/cs?searchtype=author&query=Martin%2C+L">Louis Martin</a>, <a href="/search/cs?searchtype=author&query=Mensch%2C+A">Arthur Mensch</a>, <a href="/search/cs?searchtype=author&query=Muddireddy%2C+P">Pavankumar Muddireddy</a> , et al. (17 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.07073v2-abstract-short" style="display: inline;"> We introduce Pixtral-12B, a 12-billion-parameter multimodal language model. Pixtral-12B is trained to understand both natural images and documents, achieving leading performance on various multimodal benchmarks, surpassing a number of larger models. Unlike many open-source models, Pixtral is also a cutting-edge text model for its size, and does not compromise on natural language performance to ex… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.07073v2-abstract-full').style.display = 'inline'; document.getElementById('2410.07073v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.07073v2-abstract-full" style="display: none;"> We introduce Pixtral-12B, a 12-billion-parameter multimodal language model. Pixtral-12B is trained to understand both natural images and documents, achieving leading performance on various multimodal benchmarks, surpassing a number of larger models. Unlike many open-source models, Pixtral is also a cutting-edge text model for its size, and does not compromise on natural language performance to excel in multimodal tasks. Pixtral uses a new vision encoder trained from scratch, which allows it to ingest images at their natural resolution and aspect ratio. This gives users flexibility on the number of tokens used to process an image. Pixtral is also able to process any number of images in its long context window of 128K tokens. Pixtral 12B substantially outperforms other open models of similar sizes (Llama-3.2 11B & Qwen-2-VL 7B). It also outperforms much larger open models like Llama-3.2 90B while being 7x smaller. We further contribute an open-source benchmark, MM-MT-Bench, for evaluating vision-language models in practical scenarios, and provide detailed analysis and code for standardized evaluation protocols for multimodal LLMs. Pixtral-12B is released under the Apache 2.0 license.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.07073v2-abstract-full').style.display = 'none'; document.getElementById('2410.07073v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.05928">arXiv:2410.05928</a> <span> [<a href="https://arxiv.org/pdf/2410.05928">pdf</a>, <a href="https://arxiv.org/format/2410.05928">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Beyond Captioning: Task-Specific Prompting for Improved VLM Performance in Mathematical Reasoning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Singh%2C+A">Ayush Singh</a>, <a href="/search/cs?searchtype=author&query=Gupta%2C+M">Mansi Gupta</a>, <a href="/search/cs?searchtype=author&query=Garg%2C+S">Shivank Garg</a>, <a href="/search/cs?searchtype=author&query=Kumar%2C+A">Abhinav Kumar</a>, <a href="/search/cs?searchtype=author&query=Agrawal%2C+V">Vansh Agrawal</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.05928v1-abstract-short" style="display: inline;"> Vision-Language Models (VLMs) have transformed tasks requiring visual and reasoning abilities, such as image retrieval and Visual Question Answering (VQA). Despite their success, VLMs face significant challenges with tasks involving geometric reasoning, algebraic problem-solving, and counting. These limitations stem from difficulties effectively integrating multiple modalities and accurately inter… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05928v1-abstract-full').style.display = 'inline'; document.getElementById('2410.05928v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.05928v1-abstract-full" style="display: none;"> Vision-Language Models (VLMs) have transformed tasks requiring visual and reasoning abilities, such as image retrieval and Visual Question Answering (VQA). Despite their success, VLMs face significant challenges with tasks involving geometric reasoning, algebraic problem-solving, and counting. These limitations stem from difficulties effectively integrating multiple modalities and accurately interpreting geometry-related tasks. Various works claim that introducing a captioning pipeline before VQA tasks enhances performance. We incorporated this pipeline for tasks involving geometry, algebra, and counting. 
We found that captioning results do not generalize: larger VLMs trained primarily on downstream QnA tasks show random performance on math-related challenges. We therefore present an alternative, task-based prompting, which enriches the prompt with task-specific guidance and proves more effective than direct captioning methods for math-heavy problems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05928v1-abstract-full').style.display = 'none'; document.getElementById('2410.05928v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.05915">arXiv:2410.05915</a> <span> [<a href="https://arxiv.org/pdf/2410.05915">pdf</a>, <a href="https://arxiv.org/format/2410.05915">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Give me a hint: Can LLMs take a hint to solve math problems? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Agrawal%2C+V">Vansh Agrawal</a>, <a href="/search/cs?searchtype=author&query=Singla%2C+P">Pratham Singla</a>, <a href="/search/cs?searchtype=author&query=Miglani%2C+A+S">Amitoj Singh Miglani</a>, <a href="/search/cs?searchtype=author&query=Garg%2C+S">Shivank Garg</a>, <a href="/search/cs?searchtype=author&query=Mangal%2C+A">Ayush Mangal</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.05915v2-abstract-short" style="display: inline;"> While state-of-the-art LLMs have shown poor logical and basic mathematical reasoning, recent works try to improve their problem-solving abilities using prompting techniques. We propose giving "hints" to improve the language model's performance on advanced mathematical problems, taking inspiration from how humans approach math pedagogically. We also test robustness to adversarial hints and demonstr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05915v2-abstract-full').style.display = 'inline'; document.getElementById('2410.05915v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.05915v2-abstract-full" style="display: none;"> While state-of-the-art LLMs have shown poor logical and basic mathematical reasoning, recent works try to improve their problem-solving abilities using prompting techniques. We propose giving "hints" to improve the language model's performance on advanced mathematical problems, taking inspiration from how humans approach math pedagogically. We also test robustness to adversarial hints and demonstrate the models' sensitivity to them.
We demonstrate the effectiveness of our approach by evaluating a diverse set of LLMs, presenting them with a broad set of problems of different difficulties and topics from the MATH dataset and comparing against techniques such as one-shot, few-shot, and chain of thought prompting. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05915v2-abstract-full').style.display = 'none'; document.getElementById('2410.05915v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.04447">arXiv:2410.04447</a> <span> [<a href="https://arxiv.org/pdf/2410.04447">pdf</a>, <a href="https://arxiv.org/format/2410.04447">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Attention Shift: Steering AI Away from Unsafe Content </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Garg%2C+S">Shivank Garg</a>, <a href="/search/cs?searchtype=author&query=Tiwari%2C+M">Manyana Tiwari</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.04447v1-abstract-short" style="display: inline;"> This study investigates the generation of unsafe or harmful content in state-of-the-art generative models, focusing on methods for restricting such generations. We introduce a novel training-free approach using attention reweighing to remove unsafe concepts without additional training during inference. We compare our method against existing ablation methods, evaluating the performance on both dir… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.04447v1-abstract-full').style.display = 'inline'; document.getElementById('2410.04447v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.04447v1-abstract-full" style="display: none;"> This study investigates the generation of unsafe or harmful content in state-of-the-art generative models, focusing on methods for restricting such generations. We introduce a novel training-free approach using attention reweighing to remove unsafe concepts without additional training during inference. We compare our method against existing ablation methods, evaluating the performance on both direct and adversarial jailbreak prompts, using qualitative and quantitative metrics. We hypothesize potential reasons for the observed results and discuss the limitations and broader implications of content restriction.
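<p class="is-size-7">A minimal sketch of training-free attention reweighing as described (down-scaling cross-attention mass on unsafe-concept tokens at inference, then renormalizing); the tensor layout and scale factor are assumptions, and the paper's exact scheme may differ:</p> <pre><code class="language-python">
import torch

def reweigh_attention(attn_probs, token_ids, unsafe_ids, scale=0.1):
    """attn_probs: (..., num_text_tokens) cross-attention probabilities;
    token_ids: text token ids aligned with the last dimension.
    Scales down columns for unsafe-concept tokens, then renormalizes."""
    attn = attn_probs.clone()
    for j, tok in enumerate(token_ids):
        if tok in unsafe_ids:
            attn[..., j] = attn[..., j] * scale
    return attn / attn.sum(dim=-1, keepdim=True)
</code></pre>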
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.04447v1-abstract-full').style.display = 'none'; document.getElementById('2410.04447v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.02080">arXiv:2410.02080</a> <span> [<a href="https://arxiv.org/pdf/2410.02080">pdf</a>, <a href="https://arxiv.org/format/2410.02080">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> EMMA: Efficient Visual Alignment in Multi-Modal LLMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ghazanfari%2C+S">Sara Ghazanfari</a>, <a href="/search/cs?searchtype=author&query=Araujo%2C+A">Alexandre Araujo</a>, <a href="/search/cs?searchtype=author&query=Krishnamurthy%2C+P">Prashanth Krishnamurthy</a>, <a href="/search/cs?searchtype=author&query=Garg%2C+S">Siddharth Garg</a>, <a href="/search/cs?searchtype=author&query=Khorrami%2C+F">Farshad Khorrami</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.02080v1-abstract-short" style="display: inline;"> Multi-modal Large Language Models (MLLMs) have recently exhibited impressive general-purpose capabilities by leveraging vision foundation models to encode the core concepts of images into representations. These are then combined with instructions and processed by the language model to generate high-quality responses. Despite significant progress in enhancing the language component, challenges pers… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.02080v1-abstract-full').style.display = 'inline'; document.getElementById('2410.02080v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.02080v1-abstract-full" style="display: none;"> Multi-modal Large Language Models (MLLMs) have recently exhibited impressive general-purpose capabilities by leveraging vision foundation models to encode the core concepts of images into representations. These are then combined with instructions and processed by the language model to generate high-quality responses. Despite significant progress in enhancing the language component, challenges persist in optimally fusing visual encodings within the language model for task-specific adaptability. Recent research has focused on improving this fusion through modality adaptation modules but at the cost of significantly increased model complexity and training data needs. 
In this paper, we propose EMMA (Efficient Multi-Modal Adaptation), a lightweight cross-modality module designed to efficiently fuse visual and textual encodings, generating instruction-aware visual representations for the language model. Our key contributions include: (1) an efficient early fusion mechanism that integrates vision and language representations with minimal added parameters (less than 0.2% increase in model size), (2) an in-depth interpretability analysis that sheds light on the internal mechanisms of the proposed method; (3) comprehensive experiments that demonstrate notable improvements on both specialized and general benchmarks for MLLMs. Empirical results show that EMMA boosts performance across multiple tasks by up to 9.3% while significantly improving robustness against hallucinations. Our code is available at https://github.com/SaraGhazanfari/EMMA <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.02080v1-abstract-full').style.display = 'none'; document.getElementById('2410.02080v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.19293">arXiv:2409.19293</a> <span> [<a href="https://arxiv.org/pdf/2409.19293">pdf</a>, <a href="https://arxiv.org/format/2409.19293">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> VLAD-BuFF: Burst-aware Fast Feature Aggregation for Visual Place Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Khaliq%2C+A">Ahmad Khaliq</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+M">Ming Xu</a>, <a href="/search/cs?searchtype=author&query=Hausler%2C+S">Stephen Hausler</a>, <a href="/search/cs?searchtype=author&query=Milford%2C+M">Michael Milford</a>, <a href="/search/cs?searchtype=author&query=Garg%2C+S">Sourav Garg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.19293v1-abstract-short" style="display: inline;"> Visual Place Recognition (VPR) is a crucial component of many visual localization pipelines for embodied agents. VPR is often formulated as an image retrieval task aimed at jointly learning local features and an aggregation method. The current state-of-the-art VPR methods rely on VLAD aggregation, which can be trained to learn a weighted contribution of features through their soft assignment to cl… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.19293v1-abstract-full').style.display = 'inline'; document.getElementById('2409.19293v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.19293v1-abstract-full" style="display: none;"> Visual Place Recognition (VPR) is a crucial component of many visual localization pipelines for embodied agents. 
VPR is often formulated as an image retrieval task aimed at jointly learning local features and an aggregation method. The current state-of-the-art VPR methods rely on VLAD aggregation, which can be trained to learn a weighted contribution of features through their soft assignment to cluster centers. However, this process has two key limitations. Firstly, the feature-to-cluster weighting does not account for over-represented repetitive structures within a cluster, e.g., shadows or window panes; this phenomenon is also referred to as the 'burstiness' problem, classically solved by discounting repetitive features before aggregation. Secondly, feature-to-cluster comparisons are compute-intensive for state-of-the-art image encoders with high-dimensional local features. This paper addresses these limitations by introducing VLAD-BuFF with two novel contributions: i) a self-similarity based feature discounting mechanism to learn Burst-aware features within end-to-end VPR training, and ii) Fast Feature aggregation by reducing local feature dimensions specifically through PCA-initialized learnable pre-projection. We benchmark our method on 9 public datasets, where VLAD-BuFF sets a new state of the art. Our method is able to maintain its high recall even for 12x reduced local feature dimensions, thus enabling fast feature aggregation without compromising on recall. Through additional qualitative studies, we show how our proposed weighting method effectively downweights the non-distinctive features. Source code: https://github.com/Ahmedest61/VLAD-BuFF/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.19293v1-abstract-full').style.display = 'none'; document.getElementById('2409.19293v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024.
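<p class="is-size-7">To ground the 'burstiness' idea above, here is a hedged sketch of self-similarity-based discounting (counting near-duplicate features and down-weighting them before aggregation); this illustrates the classical discounting idea, not VLAD-BuFF's learned mechanism:</p> <pre><code class="language-python">
import torch
import torch.nn.functional as F

def burst_discount(features, tau=0.1):
    """features: (N, D) local descriptors. Features with many near-duplicates
    (e.g., repeated window panes) receive smaller weights before aggregation."""
    f = F.normalize(features, dim=-1)
    sim = f @ f.t()                                    # (N, N) self-similarity
    burstiness = (sim > 1.0 - tau).float().sum(dim=1)  # near-duplicate counts (>= 1)
    weights = burstiness.rsqrt()                       # classic 1/sqrt discounting
    return features * weights.unsqueeze(-1)
</code></pre>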
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Presented at ECCV 2024; Includes supplementary; 29 pages; 7 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.18049">arXiv:2409.18049</a> <span> [<a href="https://arxiv.org/pdf/2409.18049">pdf</a>, <a href="https://arxiv.org/format/2409.18049">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Revisit Anything: Visual Place Recognition via Image Segment Retrieval </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Garg%2C+K">Kartik Garg</a>, <a href="/search/cs?searchtype=author&query=Puligilla%2C+S+S">Sai Shubodh Puligilla</a>, <a href="/search/cs?searchtype=author&query=Kolathaya%2C+S">Shishir Kolathaya</a>, <a href="/search/cs?searchtype=author&query=Krishna%2C+M">Madhava Krishna</a>, <a href="/search/cs?searchtype=author&query=Garg%2C+S">Sourav Garg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.18049v1-abstract-short" style="display: inline;"> Accurately recognizing a revisited place is crucial for embodied agents to localize and navigate. This requires visual representations to be distinct, despite strong variations in camera viewpoint and scene appearance. Existing visual place recognition pipelines encode the "whole" image and search for matches. This poses a fundamental challenge in matching two images of the same place captured fro… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.18049v1-abstract-full').style.display = 'inline'; document.getElementById('2409.18049v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.18049v1-abstract-full" style="display: none;"> Accurately recognizing a revisited place is crucial for embodied agents to localize and navigate. This requires visual representations to be distinct, despite strong variations in camera viewpoint and scene appearance. Existing visual place recognition pipelines encode the "whole" image and search for matches. This poses a fundamental challenge in matching two images of the same place captured from different camera viewpoints: "the similarity of what overlaps can be dominated by the dissimilarity of what does not overlap". We address this by encoding and searching for "image segments" instead of the whole images. We propose to use open-set image segmentation to decompose an image into `meaningful' entities (i.e., things and stuff). 
This enables us to create a novel image representation as a collection of multiple overlapping subgraphs connecting a segment with its neighboring segments, dubbed SuperSegment. Furthermore, to efficiently encode these SuperSegments into compact vector representations, we propose a novel factorized representation of feature aggregation. We show that retrieving these partial representations leads to significantly higher recognition recall than the typical whole image based retrieval. Our segments-based approach, dubbed SegVLAD, sets a new state-of-the-art in place recognition on a diverse selection of benchmark datasets, while being applicable to both generic and task-specialized image encoders. Finally, we demonstrate the potential of our method to "revisit anything" by evaluating our method on an object instance retrieval task, which bridges the two disparate areas of research: visual place recognition and object-goal navigation, through their common aim of recognizing goal objects specific to a place. Source code: https://github.com/AnyLoc/Revisit-Anything. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.18049v1-abstract-full').style.display = 'none'; document.getElementById('2409.18049v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Presented at ECCV 2024; Includes supplementary; 29 pages; 8 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.16850">arXiv:2409.16850</a> <span> [<a href="https://arxiv.org/pdf/2409.16850">pdf</a>, <a href="https://arxiv.org/format/2409.16850">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Robust Scene Change Detection Using Visual Foundation Models and Cross-Attention Mechanisms </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lin%2C+C">Chun-Jung Lin</a>, <a href="/search/cs?searchtype=author&query=Garg%2C+S">Sourav Garg</a>, <a href="/search/cs?searchtype=author&query=Chin%2C+T">Tat-Jun Chin</a>, <a href="/search/cs?searchtype=author&query=Dayoub%2C+F">Feras Dayoub</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.16850v2-abstract-short" style="display: inline;"> We present a novel method for scene change detection that leverages the robust feature extraction capabilities of a visual foundational model, DINOv2, and integrates full-image cross-attention to address key challenges such as varying lighting, seasonal variations, and viewpoint differences.
In order to effectively learn correspondences and mis-correspondences between an image pair for the change… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.16850v2-abstract-full').style.display = 'inline'; document.getElementById('2409.16850v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.16850v2-abstract-full" style="display: none;"> We present a novel method for scene change detection that leverages the robust feature extraction capabilities of a visual foundational model, DINOv2, and integrates full-image cross-attention to address key challenges such as varying lighting, seasonal variations, and viewpoint differences. In order to effectively learn correspondences and mis-correspondences between an image pair for the change detection task, we propose to a) "freeze" the backbone in order to retain the generality of dense foundation features, and b) employ "full-image" cross-attention to better tackle the viewpoint variations between the image pair. We evaluate our approach on two benchmark datasets, VL-CMU-CD and PSCD, along with their viewpoint-varied versions. Our experiments demonstrate significant improvements in F1-score, particularly in scenarios involving geometric changes between image pairs. The results indicate our method's superior generalization capabilities over existing state-of-the-art approaches, showing robustness against photometric and geometric variations as well as better overall generalization when fine-tuned to adapt to new environments. Detailed ablation studies further validate the contributions of each component in our architecture. Our source code is available at: https://github.com/ChadLin9596/Robust-Scene-Change-Detection. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.16850v2-abstract-full').style.display = 'none'; document.getElementById('2409.16850v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 25 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024.
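<p class="is-size-7">A hedged sketch of the "full-image" cross-attention idea described above (every patch of one image attends over all patches of the other); the dimensions and the single-layer form are assumptions for illustration, not the paper's architecture:</p> <pre><code class="language-python">
import torch
import torch.nn as nn

class CrossImageAttention(nn.Module):
    """Full-image cross-attention between (frozen) dense patch features of an
    image pair, e.g., DINOv2-style tokens of shape (B, num_patches, dim)."""
    def __init__(self, dim=768, heads=8):
        super().__init__()
        self.attn = nn.MultiheadAttention(dim, heads, batch_first=True)

    def forward(self, feats_t0, feats_t1):
        # Queries from image t0 attend over all patches of image t1, exposing
        # correspondences and mis-correspondences for a change-detection head.
        out, _ = self.attn(query=feats_t0, key=feats_t1, value=feats_t1)
        return out
</code></pre>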
arXiv:2409.10161 [pdf, other] https://arxiv.org/abs/2409.10161
Subjects: cs.RO (Robotics); cs.AI (Artificial Intelligence); cs.CV (Computer Vision and Pattern Recognition); cs.LG (Machine Learning)
SplatSim: Zero-Shot Sim2Real Transfer of RGB Manipulation Policies Using Gaussian Splatting
Authors: Mohammad Nomaan Qureshi, Sparsh Garg, Francisco Yandun, David Held, George Kantor, Abhisesh Silwal
Abstract: Sim2Real transfer, particularly for manipulation policies relying on RGB images, remains a critical challenge in robotics due to the significant domain shift between synthetic and real-world visual data. In this paper, we propose SplatSim, a novel framework that leverages Gaussian Splatting as the primary rendering primitive to reduce the Sim2Real gap for RGB-based manipulation policies. By replacing traditional mesh representations with Gaussian Splats in simulators, SplatSim produces highly photorealistic synthetic data while maintaining the scalability and cost-efficiency of simulation. We demonstrate the effectiveness of our framework by training manipulation policies within SplatSim and deploying them in the real world in a zero-shot manner, achieving an average success rate of 86.25%, compared to 97.5% for policies trained on real-world data. Videos can be found on our project page: https://splatsim.github.io
Submitted 6 October, 2024; v1 submitted 16 September, 2024; originally announced September 2024.
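The rendering primitive being swapped in is standard Gaussian Splatting, whose core step is depth-sorted alpha compositing of Gaussians along each pixel's ray. Below is a per-pixel sketch with precomputed opacities, not SplatSim's actual renderer; real implementations evaluate projected 2-D Gaussians for every pixel on the GPU.

```python
import numpy as np

def composite(colors, alphas, depths):
    """colors: (N, 3) RGB, alphas: (N,) opacity, depths: (N,) camera depth."""
    order = np.argsort(depths)              # sort splats front-to-back
    pixel, transmittance = np.zeros(3), 1.0
    for i in order:
        pixel += transmittance * alphas[i] * colors[i]
        transmittance *= (1.0 - alphas[i])  # light remaining after splat i
    return pixel

print(composite(np.array([[1, 0, 0], [0, 1, 0.]]),
                np.array([0.6, 0.9]), np.array([0.5, 2.0])))
# the red splat in front dominates: ~[0.6, 0.36, 0.]
```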
arXiv:2408.08179 [pdf, other] https://arxiv.org/abs/2408.08179
Subjects: cs.LG (Machine Learning)
Machine learning empowered Modulation detection for OFDM-based signals
Authors: Ali Pourranjbar, Georges Kaddoum, Verdier Assoume Mba, Sahil Garg, Satinder Singh
Abstract: We propose a blind ML-based modulation detection method for OFDM-based technologies. Unlike previous works that assume an ideal environment with precise knowledge of the subcarrier count and cyclic prefix location, we consider blind modulation detection while accounting for realistic environmental parameters and imperfections. Our approach employs a ResNet network to simultaneously detect the modulation type and accurately locate the cyclic prefix. Specifically, after eliminating the environmental impact from the signal and accurately extracting the OFDM symbols, we convert these symbols into scatter plots. Due to their unique shapes, these scatter plots are then classified using ResNet. As a result, our proposed modulation classification method can be applied to any OFDM-based technology without prior knowledge of the transmitted signal. We evaluate its performance across various modulation schemes and subcarrier numbers. Simulation results show that our method achieves a modulation detection accuracy exceeding $80\%$ at an SNR of $10$ dB and $95\%$ at an SNR of $25$ dB.
Submitted 15 August, 2024; originally announced August 2024.
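The final classification step lends itself to a small sketch: rasterize the recovered symbols into a constellation "scatter plot" image and hand it to a ResNet. The QPSK symbols, image size, and five-way label set below are stand-ins; channel equalization and cyclic-prefix recovery are assumed already done.

```python
import numpy as np, torch
from torchvision.models import resnet18

def constellation_image(symbols, bins=64):
    """Complex symbols -> (1, 3, bins, bins) 2-D histogram 'scatter plot'."""
    h, _, _ = np.histogram2d(symbols.real, symbols.imag,
                             bins=bins, range=[[-2, 2], [-2, 2]])
    img = torch.tensor(h / (h.max() + 1e-9), dtype=torch.float32)
    return img.expand(3, -1, -1).unsqueeze(0)   # replicate to 3 channels

# Noisy QPSK symbols; each modulation scheme leaves a distinct blob pattern.
qpsk = np.random.choice([-1, 1], 4096) + 1j * np.random.choice([-1, 1], 4096)
noisy = (qpsk + 0.1 * (np.random.randn(4096) + 1j * np.random.randn(4096))) / np.sqrt(2)

model = resnet18(num_classes=5)   # e.g. BPSK/QPSK/8PSK/16QAM/64QAM, untrained here
print(model(constellation_image(noisy)).shape)  # torch.Size([1, 5])
```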
arXiv:2408.05890 [pdf, other] https://arxiv.org/abs/2408.05890
Subjects: cs.AR (Hardware Architecture); cs.CR (Cryptography and Security)
DOI: 10.1145/3656019.3676898
SZKP: A Scalable Accelerator Architecture for Zero-Knowledge Proofs
Authors: Alhad Daftardar, Brandon Reagen, Siddharth Garg
Abstract: Zero-Knowledge Proofs (ZKPs) are an emergent paradigm in verifiable computing. In the context of applications like cloud computing, ZKPs can be used by a client (called the verifier) to verify that the service provider (called the prover) is in fact performing the correct computation based on a public input. A recently prominent variant of ZKPs is zkSNARKs, which generate succinct proofs that can be rapidly verified by the end user. However, proof generation itself is very time consuming per transaction. Two key primitives in proof generation are the Number Theoretic Transform (NTT) and Multi-scalar Multiplication (MSM). These primitives are prime candidates for hardware acceleration, and prior works have looked at GPU implementations and custom RTL. However, both algorithms involve complex dataflow patterns: standard NTTs have irregular memory accesses for butterfly computations from stage to stage, and MSMs using Pippenger's algorithm have data-dependent memory accesses for partial sum calculations. We present SZKP, a scalable accelerator framework that is the first ASIC to accelerate an entire proof on-chip by leveraging structured dataflows for both NTTs and MSMs. SZKP achieves conservative full-proof speedups of over 400$\times$, 3$\times$, and 12$\times$ over CPU, ASIC, and GPU implementations.
Submitted 11 August, 2024; originally announced August 2024.
Comments: Accepted to the 33rd International Conference on Parallel Architectures and Compilation Techniques (PACT), 2024
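The NTT half of the workload is easy to show in miniature. Below is a plain iterative radix-2 NTT over GF(p); the strided `a[k]` / `a[k + length // 2]` butterfly pairs, whose stride changes every stage, are exactly the "irregular memory accesses" the abstract refers to. SZKP's structured ASIC dataflow is not reproduced here.

```python
def ntt(a, p, g):
    """NTT of a (length n, a power of two) over GF(p).
    Requires n | p - 1 and g a primitive root mod p."""
    n, a = len(a), a[:]
    j = 0                                     # bit-reversal permutation
    for i in range(1, n):
        bit = n >> 1
        while j & bit:
            j ^= bit; bit >>= 1
        j |= bit
        if i < j:
            a[i], a[j] = a[j], a[i]
    length = 2
    while length <= n:                        # one pass per butterfly stage
        w_len = pow(g, (p - 1) // length, p)  # primitive length-th root of unity
        for start in range(0, n, length):
            w = 1
            for k in range(start, start + length // 2):
                u, v = a[k], a[k + length // 2] * w % p
                a[k], a[k + length // 2] = (u + v) % p, (u - v) % p
                w = w * w_len % p
        length <<= 1
    return a

print(ntt([1, 2, 3, 4], p=17, g=3))  # [10, 6, 15, 7]
```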
arXiv:2408.00118 [pdf, other] https://arxiv.org/abs/2408.00118
Subjects: cs.CL (Computation and Language); cs.AI (Artificial Intelligence)
Gemma 2: Improving Open Language Models at a Practical Size
Authors: Gemma Team: Morgane Riviere, Shreya Pathak, Pier Giuseppe Sessa, Cassidy Hardin, Surya Bhupatiraju, Léonard Hussenot, Thomas Mesnard, Bobak Shahriari, Alexandre Ramé, Johan Ferret, Peter Liu, Pouya Tafti, Abe Friesen, Michelle Casbon, Sabela Ramos, Ravin Kumar, Charline Le Lan,
href="/search/cs?searchtype=author&query=Jerome%2C+S">Sammy Jerome</a>, <a href="/search/cs?searchtype=author&query=Tsitsulin%2C+A">Anton Tsitsulin</a>, <a href="/search/cs?searchtype=author&query=Vieillard%2C+N">Nino Vieillard</a>, <a href="/search/cs?searchtype=author&query=Stanczyk%2C+P">Piotr Stanczyk</a>, <a href="/search/cs?searchtype=author&query=Girgin%2C+S">Sertan Girgin</a>, <a href="/search/cs?searchtype=author&query=Momchev%2C+N">Nikola Momchev</a>, <a href="/search/cs?searchtype=author&query=Hoffman%2C+M">Matt Hoffman</a> , et al. (173 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.00118v3-abstract-short" style="display: inline;"> In this work, we introduce Gemma 2, a new addition to the Gemma family of lightweight, state-of-the-art open models, ranging in scale from 2 billion to 27 billion parameters. In this new version, we apply several known technical modifications to the Transformer architecture, such as interleaving local-global attentions (Beltagy et al., 2020a) and group-query attention (Ainslie et al., 2023). We al… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.00118v3-abstract-full').style.display = 'inline'; document.getElementById('2408.00118v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.00118v3-abstract-full" style="display: none;"> In this work, we introduce Gemma 2, a new addition to the Gemma family of lightweight, state-of-the-art open models, ranging in scale from 2 billion to 27 billion parameters. In this new version, we apply several known technical modifications to the Transformer architecture, such as interleaving local-global attentions (Beltagy et al., 2020a) and group-query attention (Ainslie et al., 2023). We also train the 2B and 9B models with knowledge distillation (Hinton et al., 2015) instead of next token prediction. The resulting models deliver the best performance for their size, and even offer competitive alternatives to models that are 2-3 times bigger. We release all our models to the community. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.00118v3-abstract-full').style.display = 'none'; document.getElementById('2408.00118v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 31 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. 
arXiv:2407.20438 [pdf, other] https://arxiv.org/abs/2407.20438
Subjects: cs.CL (Computation and Language); cs.AI (Artificial Intelligence)
Generating Gender Alternatives in Machine Translation
Authors: Sarthak Garg, Mozhdeh Gheini, Clara Emmanuel, Tatiana Likhomanenko, Qin Gao, Matthias Paulik
Abstract: Machine translation (MT) systems often translate terms with ambiguous gender (e.g., the English term "the nurse") into the gendered form that is most prevalent in the systems' training data (e.g., "enfermera", the Spanish term for a female nurse). This often reflects and perpetuates harmful stereotypes present in society. With MT user interfaces in mind that allow resolving gender ambiguity in a frictionless manner, we study the problem of generating all grammatically correct gendered translation alternatives. We open-source train and test datasets for five language pairs and establish benchmarks for this task. Our key technical contribution is a novel semi-supervised solution for generating alternatives that integrates seamlessly with standard MT models and maintains high performance without requiring additional components or increasing inference overhead.
Submitted 29 July, 2024; originally announced July 2024.
Comments: GeBNLP 2024
arXiv:2407.18276 [pdf, other] https://arxiv.org/abs/2407.18276
Subjects: cs.AR (Hardware Architecture); cs.AI (Artificial Intelligence)
Rome was Not Built in a Single Step: Hierarchical Prompting for LLM-based Chip Design
Authors: Andre Nakkab, Sai Qian Zhang, Ramesh Karri, Siddharth Garg
Abstract: Large Language Models (LLMs) are effective in computer hardware synthesis via hardware description language (HDL) generation. However, LLM-assisted approaches for HDL generation struggle when handling complex tasks. We introduce a suite of hierarchical prompting techniques which facilitate efficient stepwise design methods, and develop a generalizable automation pipeline for the process. To evaluate these techniques, we present a benchmark set of hardware designs which have solutions with or without architectural hierarchy. Using these benchmarks, we compare various open-source and proprietary LLMs, including our own fine-tuned Code Llama-Verilog model. Our hierarchical methods automatically produce successful designs for complex hardware modules that standard flat prompting methods cannot achieve, allowing smaller open-source LLMs to compete with large proprietary models. Hierarchical prompting reduces HDL generation time and yields savings on LLM costs. Our experiments detail which LLMs are capable of which applications, and how to apply hierarchical methods in various modes. We explore case studies of generating complex cores using automatic scripted hierarchical prompts, including the first-ever LLM-designed processor with no human feedback. Tools for the Recurrent Optimization via Machine Editing (ROME) method can be found at https://github.com/ajn313/ROME-LLM
Submitted 9 September, 2024; v1 submitted 23 July, 2024; originally announced July 2024.
Comments: Accepted at MLCAD '24. 10 pages, 7 figures, 5 tables
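A toy rendering of the stepwise idea: ask for a module decomposition first, generate the leaves, then compose upward, instead of issuing one flat prompt for the whole design. `llm` and the prompt wording are hypothetical stand-ins, not the paper's ROME tooling.

```python
def generate_hierarchically(spec, llm):
    """llm: any prompt -> completion function (hypothetical stand-in)."""
    plan = llm(f"List the Verilog submodules needed for: {spec}, one per line.")
    names = [line.strip() for line in plan.splitlines() if line.strip()]
    # Generate the leaves first, each as its own small, tractable prompt...
    parts = [llm(f"Write synthesizable Verilog for submodule: {n}") for n in names]
    # ...then compose upward, so no single prompt carries the whole design.
    return llm("Write a top-level module instantiating these submodules:\n\n"
               + "\n\n".join(parts))
```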
arXiv:2407.16119 [pdf, other] https://arxiv.org/abs/2407.16119
Subjects: cs.GR (Graphics); cs.AI (Artificial Intelligence); cs.LG (Machine Learning)
Uncertainty-Aware Deep Neural Representations for Visual Analysis of Vector Field Data
Authors: Atul Kumar, Siddharth Garg, Soumya Dutta
Abstract: The widespread use of Deep Neural Networks (DNNs) has recently resulted in their application to challenging scientific visualization tasks. While advanced DNNs demonstrate impressive generalization abilities, understanding factors like prediction quality, confidence, robustness, and uncertainty is crucial. These insights aid application scientists in making informed decisions. However, DNNs lack inherent mechanisms to measure prediction uncertainty, prompting the creation of distinct frameworks for constructing robust uncertainty-aware models tailored to various visualization tasks. In this work, we develop uncertainty-aware implicit neural representations to model steady-state vector fields effectively. We comprehensively evaluate the efficacy of two principled deep uncertainty estimation techniques, (1) Deep Ensemble and (2) Monte Carlo Dropout, aimed at enabling uncertainty-informed visual analysis of features within steady vector field data. Our detailed exploration using several vector data sets indicates that uncertainty-aware models generate informative visualization results of vector field features. Furthermore, incorporating prediction uncertainty improves the resilience and interpretability of our DNN model, rendering it applicable for the analysis of non-trivial vector field data sets.
Submitted 10 August, 2024; v1 submitted 22 July, 2024; originally announced July 2024.
Comments: Accepted for publication at IEEE Visualization 2024
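Of the two techniques evaluated, Monte Carlo Dropout is compact enough to sketch: keep dropout active at inference time and read the spread of repeated predictions as uncertainty. The coordinate MLP below is a generic implicit neural representation, not the paper's exact architecture.

```python
import torch
import torch.nn as nn

class VectorFieldINR(nn.Module):
    def __init__(self, hidden=128, p_drop=0.1):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(3, hidden), nn.ReLU(), nn.Dropout(p_drop),
            nn.Linear(hidden, hidden), nn.ReLU(), nn.Dropout(p_drop),
            nn.Linear(hidden, 3))                  # (x, y, z) -> vector

    def forward(self, coords):
        return self.net(coords)

@torch.no_grad()
def mc_dropout_predict(model, coords, samples=32):
    model.train()                                  # keeps Dropout stochastic
    preds = torch.stack([model(coords) for _ in range(samples)])
    return preds.mean(0), preds.std(0)             # prediction, uncertainty

mean, std = mc_dropout_predict(VectorFieldINR(), torch.rand(1000, 3))
print(mean.shape, std.shape)   # torch.Size([1000, 3]) twice
```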
arXiv:2407.02929 [pdf, other] https://arxiv.org/abs/2407.02929
Subjects: cs.NI (Networking and Internet Architecture)
A Hybrid Reactive Routing Protocol for Decentralized UAV Networks
Authors: Shivam Garg, Alexander Ihler, Elizabeth Serena Bentley, Sunil Kumar
Abstract: Wireless networks consisting of low-SWaP (size, weight, and power), fixed-wing UAVs are used in many applications, such as monitoring, search, and surveillance of inaccessible areas.
A decentralized and autonomous approach ensures robustness to failures; the UAVs explore and sense within the area and forward their information, in a multihop manner, to nearby aerial gateway nodes. However, the unpredictable nature of the events, the relatively high speed of the UAVs, and their dynamic trajectories cause the network topology to change significantly over time, resulting in frequent route breaks. A holistic routing approach is needed to support multiple traffic flows in these networks: one that provides mobility- and congestion-aware, high-quality routes when needed, with low control and computational overhead, using information collected in a distributed manner. Existing routing schemes do not address all of these issues. We present a hybrid reactive routing protocol for decentralized UAV networks. Our scheme searches routes on demand, monitors a region around the selected route (the "pipe"), and proactively switches to an alternative route before the current route's quality degrades below a threshold. We empirically evaluate the impact of pipe width and node density on our ability to find alternate high-quality routes within the pipe, and the overhead required to maintain the pipe. Compared to existing reactive routing schemes, our approach achieves higher throughput and reduces the number of route discoveries, overhead, and resulting flow interruptions at different traffic loads, node densities, and speeds. Despite having limited network topology information and low overhead and route computation complexity, our proposed scheme achieves throughput superior to a proactive optimized link state routing scheme across different network and traffic settings. We also evaluate the relative performance of reactive and proactive routing schemes.
Submitted 23 January, 2025; v1 submitted 3 July, 2024; originally announced July 2024.
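The control loop described above reduces to: discover reactively, monitor the pipe, and switch proactively on a quality threshold. A deliberately abstract sketch; every callable, and the threshold, is illustrative rather than the protocol's actual metric.

```python
def maintain_route(src, dst, discover, alt_routes_in_pipe, quality, threshold=0.5):
    """All arguments are illustrative callables; quality maps a route to [0, 1]."""
    route = discover(src, dst)                 # reactive: search on demand only
    while True:
        yield route                            # serve traffic on the current route
        if quality(route) < threshold:         # degrading, but not yet broken
            good = [r for r in alt_routes_in_pipe(route) if quality(r) >= threshold]
            # Switch proactively inside the monitored pipe when possible;
            # fall back to a fresh (more expensive) route discovery otherwise.
            route = max(good, key=quality) if good else discover(src, dst)
```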
arXiv:2406.19549 [pdf, other] https://arxiv.org/abs/2406.19549
Subjects: cs.CR (Cryptography and Security); cs.LG (Machine Learning)
ASCENT: Amplifying Power Side-Channel Resilience via Learning & Monte-Carlo Tree Search
Authors: Jitendra Bhandari, Animesh Basak Chowdhury, Mohammed Nabeel, Ozgur Sinanoglu, Siddharth Garg, Ramesh Karri, Johann Knechtel
Abstract: Power side-channel (PSC) analysis is pivotal for securing cryptographic hardware. Prior art focused on securing gate-level netlists obtained as-is from chip design automation, neglecting all the complexities and potential side-effects for security arising from the design automation process. That is, automation traditionally prioritizes power, performance, and area (PPA), sidelining security. We propose a "security-first" approach, refining the logic synthesis stage to enhance the overall resilience of PSC countermeasures. We introduce ASCENT, a learning-and-search-based framework that (i) drastically reduces the time for post-design PSC evaluation and (ii) explores the security-vs-PPA design space. Thus, ASCENT enables an efficient exploration of a large number of candidate netlists, leading to an improvement in PSC resilience compared to regular PPA-optimized netlists. ASCENT is up to 120x faster than traditional PSC analysis and yields a 3.11x improvement in the PSC resilience of state-of-the-art PSC countermeasures.
Submitted 1 July, 2024; v1 submitted 27 June, 2024; originally announced June 2024.
Comments: Accepted at 2024 ACM/IEEE International Conference on Computer-Aided Design
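The search half of ASCENT is Monte-Carlo Tree Search. Below is the generic UCT selection rule such a search repeats at every tree node; the synthesis actions and the PSC-vs-PPA reward that make it ASCENT are the paper's contribution and are not reproduced here.

```python
import math

def uct_select(children, c=1.4):
    """children: list of dicts with 'visits' and 'value' (total reward)."""
    total = sum(ch["visits"] for ch in children)
    def score(ch):
        if ch["visits"] == 0:
            return math.inf                    # always try unvisited moves first
        exploit = ch["value"] / ch["visits"]   # mean reward observed so far
        explore = c * math.sqrt(math.log(total) / ch["visits"])
        return exploit + explore
    return max(children, key=score)

print(uct_select([{"visits": 10, "value": 6.0},
                  {"visits": 3,  "value": 2.4}]))  # picks the less-explored arm
```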
arXiv:2406.17132 [pdf, other] https://arxiv.org/abs/2406.17132
Subjects: cs.AR (Hardware Architecture)
LLM-Aided Testbench Generation and Bug Detection for Finite-State Machines
Authors: Jitendra Bhandari, Johann Knechtel, Ramesh Narayanaswamy, Siddharth Garg, Ramesh Karri
Abstract: This work investigates the potential of tailoring Large Language Models (LLMs), specifically GPT-3.5 and GPT-4, for the domain of chip testing. A key aspect of chip design is functional testing, which relies on testbenches to evaluate the functionality and coverage of Register-Transfer Level (RTL) designs. We aim to enhance testbench generation by incorporating feedback from commercial-grade Electronic Design Automation (EDA) tools into LLMs. Through iterative feedback from these tools, we refine the testbenches to achieve improved test coverage. Our case studies present promising results, demonstrating that this approach can effectively enhance test coverage. By integrating EDA tool feedback, the generated testbenches become more accurate in identifying potential issues in the RTL design. Furthermore, we extended our study to use this enhanced test-coverage framework for detecting bugs in RTL implementations.
Submitted 24 June, 2024; originally announced June 2024.
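The iterative feedback loop is simple to sketch: generate a testbench, run a coverage tool, and feed the report back until coverage converges. `llm`, `run_coverage`, and the report fields are hypothetical stand-ins for the GPT-3.5/4 calls and commercial EDA tools the paper uses.

```python
def refine_testbench(rtl, llm, run_coverage, target=0.95, max_iters=5):
    """llm and run_coverage are hypothetical stand-ins for model and EDA tool."""
    tb = llm(f"Write a testbench for this RTL design:\n{rtl}")
    for _ in range(max_iters):
        report = run_coverage(rtl, tb)         # e.g. line/toggle/FSM coverage
        if report.coverage >= target:
            break                              # coverage goal reached: stop
        tb = llm(f"Coverage is {report.coverage:.0%}. Uncovered items:\n"
                 f"{report.uncovered}\nImprove this testbench:\n{tb}")
    return tb
```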