CINXE.COM

Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/> <!-- new favicon config and versions by realfavicongenerator.net --> <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b"> <!-- end favicon config --> <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\\(","\\)"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a> <!-- contains Cornell logo and sponsor statement --> <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 97 results for author: <span class="mathjax">Hegde, C</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Hegde%2C+C">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Hegde, C"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Hegde%2C+C&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Hegde, C"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Hegde%2C+C&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Hegde%2C+C&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Hegde%2C+C&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.00922">arXiv:2502.00922</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.00922">pdf</a>, <a href="https://arxiv.org/format/2502.00922">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> </div> </div> <p class="title is-5 mathjax"> Huff-LLM: End-to-End Lossless Compression for Efficient LLM Inference </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yubeaton%2C+P">Patrick Yubeaton</a>, <a href="/search/cs?searchtype=author&amp;query=Mahmoud%2C+T">Tareq Mahmoud</a>, <a href="/search/cs?searchtype=author&amp;query=Naga%2C+S">Shehab Naga</a>, <a href="/search/cs?searchtype=author&amp;query=Taheri%2C+P">Pooria Taheri</a>, <a href="/search/cs?searchtype=author&amp;query=Xia%2C+T">Tianhua Xia</a>, <a href="/search/cs?searchtype=author&amp;query=George%2C+A">Arun George</a>, <a href="/search/cs?searchtype=author&amp;query=Khalil%2C+Y">Yasmein Khalil</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+S+Q">Sai Qian Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Joshi%2C+S">Siddharth Joshi</a>, <a href="/search/cs?searchtype=author&amp;query=Hegde%2C+C">Chinmay Hegde</a>, <a href="/search/cs?searchtype=author&amp;query=Garg%2C+S">Siddharth Garg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.00922v1-abstract-short" style="display: inline;"> As they become more capable, large language models (LLMs) have continued to rapidly increase in size. This has exacerbated the difficulty in running state of the art LLMs on small, edge devices. Standard techniques advocate solving this problem through lossy compression techniques such as quantization or pruning. However, such compression techniques are lossy, and have been shown to change model b&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00922v1-abstract-full').style.display = 'inline'; document.getElementById('2502.00922v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.00922v1-abstract-full" style="display: none;"> As they become more capable, large language models (LLMs) have continued to rapidly increase in size. This has exacerbated the difficulty in running state of the art LLMs on small, edge devices. Standard techniques advocate solving this problem through lossy compression techniques such as quantization or pruning. However, such compression techniques are lossy, and have been shown to change model behavior in unpredictable manners. We propose Huff-LLM, an \emph{end-to-end, lossless} model compression method that lets users store LLM weights in compressed format \emph{everywhere} -- cloud, disk, main memory, and even in on-chip memory/buffers. This allows us to not only load larger models in main memory, but also reduces bandwidth required to load weights on chip, and makes more efficient use of on-chip weight buffers. In addition to the memory savings achieved via compression, we also show latency and energy efficiency improvements when performing inference with the compressed model. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00922v1-abstract-full').style.display = 'none'; document.getElementById('2502.00922v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.18880">arXiv:2501.18880</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.18880">pdf</a>, <a href="https://arxiv.org/format/2501.18880">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> RLS3: RL-Based Synthetic Sample Selection to Enhance Spatial Reasoning in Vision-Language Models for Indoor Autonomous Perception </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Waite%2C+J+R">Joshua R. Waite</a>, <a href="/search/cs?searchtype=author&amp;query=Hasan%2C+M+Z">Md. Zahid Hasan</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Q">Qisai Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+Z">Zhanhong Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Hegde%2C+C">Chinmay Hegde</a>, <a href="/search/cs?searchtype=author&amp;query=Sarkar%2C+S">Soumik Sarkar</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.18880v1-abstract-short" style="display: inline;"> Vision-language model (VLM) fine-tuning for application-specific visual grounding based on natural language instructions has become one of the most popular approaches for learning-enabled autonomous systems. However, such fine-tuning relies heavily on high-quality datasets to achieve successful performance in various downstream tasks. Additionally, VLMs often encounter limitations due to insuffici&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.18880v1-abstract-full').style.display = 'inline'; document.getElementById('2501.18880v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.18880v1-abstract-full" style="display: none;"> Vision-language model (VLM) fine-tuning for application-specific visual grounding based on natural language instructions has become one of the most popular approaches for learning-enabled autonomous systems. However, such fine-tuning relies heavily on high-quality datasets to achieve successful performance in various downstream tasks. Additionally, VLMs often encounter limitations due to insufficient and imbalanced fine-tuning data. To address these issues, we propose a new generalizable framework to improve VLM fine-tuning by integrating it with a reinforcement learning (RL) agent. Our method utilizes the RL agent to manipulate objects within an indoor setting to create synthetic data for fine-tuning to address certain vulnerabilities of the VLM. Specifically, we use the performance of the VLM to provide feedback to the RL agent to generate informative data that efficiently fine-tune the VLM over the targeted task (e.g. spatial reasoning). The key contribution of this work is developing a framework where the RL agent serves as an informative data sampling tool and assists the VLM in order to enhance performance and address task-specific vulnerabilities. By targeting the data sampling process to address the weaknesses of the VLM, we can effectively train a more context-aware model. In addition, generating synthetic data allows us to have precise control over each scene and generate granular ground truth captions. Our results show that the proposed data generation approach improves the spatial reasoning performance of VLMs, which demonstrates the benefits of using RL-guided data generation in vision-language tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.18880v1-abstract-full').style.display = 'none'; document.getElementById('2501.18880v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICCPS 2025 accepted paper, 10 pages, 9 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.18511">arXiv:2501.18511</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.18511">pdf</a>, <a href="https://arxiv.org/format/2501.18511">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> WILDCHAT-50M: A Deep Dive Into the Role of Synthetic Data in Post-Training </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Feuer%2C+B">Benjamin Feuer</a>, <a href="/search/cs?searchtype=author&amp;query=Hegde%2C+C">Chinmay Hegde</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.18511v1-abstract-short" style="display: inline;"> Language model (LLM) post-training, from DPO to distillation, can refine behaviors and unlock new skills, but the open science supporting these post-training techniques is still in its infancy. One limiting factor has been the difficulty of conducting large-scale comparative analyses of synthetic data generating models and LLM judges. To close this gap, we introduce WILDCHAT-50M, the largest publi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.18511v1-abstract-full').style.display = 'inline'; document.getElementById('2501.18511v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.18511v1-abstract-full" style="display: none;"> Language model (LLM) post-training, from DPO to distillation, can refine behaviors and unlock new skills, but the open science supporting these post-training techniques is still in its infancy. One limiting factor has been the difficulty of conducting large-scale comparative analyses of synthetic data generating models and LLM judges. To close this gap, we introduce WILDCHAT-50M, the largest public chat dataset to date. We extend the existing WildChat dataset to include responses not only from GPT, but from over 50 different open-weight models, ranging in size from 0.5B to 104B parameters. We conduct an extensive comparative analysis and demonstrate the potential of this dataset by creating RE-WILD, our own public SFT mix, which outperforms the recent Tulu-3 SFT mixture from Allen AI with only 40% as many samples. Our dataset, samples and code are available at https://github.com/penfever/wildchat-50m. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.18511v1-abstract-full').style.display = 'none'; document.getElementById('2501.18511v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.01453">arXiv:2501.01453</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.01453">pdf</a>, <a href="https://arxiv.org/format/2501.01453">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Fluid Dynamics">physics.flu-dyn</span> </div> </div> <p class="title is-5 mathjax"> Geometry Matters: Benchmarking Scientific ML Approaches for Flow Prediction around Complex Geometries </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Rabeh%2C+A">Ali Rabeh</a>, <a href="/search/cs?searchtype=author&amp;query=Herron%2C+E">Ethan Herron</a>, <a href="/search/cs?searchtype=author&amp;query=Balu%2C+A">Aditya Balu</a>, <a href="/search/cs?searchtype=author&amp;query=Sarkar%2C+S">Soumik Sarkar</a>, <a href="/search/cs?searchtype=author&amp;query=Hegde%2C+C">Chinmay Hegde</a>, <a href="/search/cs?searchtype=author&amp;query=Krishnamurthy%2C+A">Adarsh Krishnamurthy</a>, <a href="/search/cs?searchtype=author&amp;query=Ganapathysubramanian%2C+B">Baskar Ganapathysubramanian</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.01453v1-abstract-short" style="display: inline;"> Rapid yet accurate simulations of fluid dynamics around complex geometries is critical in a variety of engineering and scientific applications, including aerodynamics and biomedical flows. However, while scientific machine learning (SciML) has shown promise, most studies are constrained to simple geometries, leaving complex, real-world scenarios underexplored. This study addresses this gap by benc&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.01453v1-abstract-full').style.display = 'inline'; document.getElementById('2501.01453v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.01453v1-abstract-full" style="display: none;"> Rapid yet accurate simulations of fluid dynamics around complex geometries is critical in a variety of engineering and scientific applications, including aerodynamics and biomedical flows. However, while scientific machine learning (SciML) has shown promise, most studies are constrained to simple geometries, leaving complex, real-world scenarios underexplored. This study addresses this gap by benchmarking diverse SciML models, including neural operators and vision transformer-based foundation models, for fluid flow prediction over intricate geometries. Using a high-fidelity dataset of steady-state flows across various geometries, we evaluate the impact of geometric representations -- Signed Distance Fields (SDF) and binary masks -- on model accuracy, scalability, and generalization. Central to this effort is the introduction of a novel, unified scoring framework that integrates metrics for global accuracy, boundary layer fidelity, and physical consistency to enable a robust, comparative evaluation of model performance. Our findings demonstrate that foundation models significantly outperform neural operators, particularly in data-limited scenarios, and that SDF representations yield superior results with sufficient training data. Despite these advancements, all models struggle with out-of-distribution generalization, highlighting a critical challenge for future SciML applications. By advancing both evaluation methodologies and modeling capabilities, this work paves the way for robust and scalable ML solutions for fluid dynamics across complex geometries. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.01453v1-abstract-full').style.display = 'none'; document.getElementById('2501.01453v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.18696">arXiv:2412.18696</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.18696">pdf</a>, <a href="https://arxiv.org/format/2412.18696">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> STITCH: Surface reconstrucTion using Implicit neural representations with Topology Constraints and persistent Homology </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Jignasu%2C+A">Anushrut Jignasu</a>, <a href="/search/cs?searchtype=author&amp;query=Herron%2C+E">Ethan Herron</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+Z">Zhanhong Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Sarkar%2C+S">Soumik Sarkar</a>, <a href="/search/cs?searchtype=author&amp;query=Hegde%2C+C">Chinmay Hegde</a>, <a href="/search/cs?searchtype=author&amp;query=Ganapathysubramanian%2C+B">Baskar Ganapathysubramanian</a>, <a href="/search/cs?searchtype=author&amp;query=Balu%2C+A">Aditya Balu</a>, <a href="/search/cs?searchtype=author&amp;query=Krishnamurthy%2C+A">Adarsh Krishnamurthy</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.18696v2-abstract-short" style="display: inline;"> We present STITCH, a novel approach for neural implicit surface reconstruction of a sparse and irregularly spaced point cloud while enforcing topological constraints (such as having a single connected component). We develop a new differentiable framework based on persistent homology to formulate topological loss terms that enforce the prior of a single 2-manifold object. Our method demonstrates ex&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.18696v2-abstract-full').style.display = 'inline'; document.getElementById('2412.18696v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.18696v2-abstract-full" style="display: none;"> We present STITCH, a novel approach for neural implicit surface reconstruction of a sparse and irregularly spaced point cloud while enforcing topological constraints (such as having a single connected component). We develop a new differentiable framework based on persistent homology to formulate topological loss terms that enforce the prior of a single 2-manifold object. Our method demonstrates excellent performance in preserving the topology of complex 3D geometries, evident through both visual and empirical comparisons. We supplement this with a theoretical analysis, and provably show that optimizing the loss with stochastic (sub)gradient descent leads to convergence and enables reconstructing shapes with a single connected component. Our approach showcases the integration of differentiable topological data analysis tools for implicit surface reconstruction. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.18696v2-abstract-full').style.display = 'none'; document.getElementById('2412.18696v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">19 pages, 12 figures, 29 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.17998">arXiv:2412.17998</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.17998">pdf</a>, <a href="https://arxiv.org/format/2412.17998">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3696410.3714810">10.1145/3696410.3714810 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> WavePulse: Real-time Content Analytics of Radio Livestreams </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Mittal%2C+G">Govind Mittal</a>, <a href="/search/cs?searchtype=author&amp;query=Gupta%2C+S">Sarthak Gupta</a>, <a href="/search/cs?searchtype=author&amp;query=Wagle%2C+S">Shruti Wagle</a>, <a href="/search/cs?searchtype=author&amp;query=Chopra%2C+C">Chirag Chopra</a>, <a href="/search/cs?searchtype=author&amp;query=DeMattee%2C+A+J">Anthony J DeMattee</a>, <a href="/search/cs?searchtype=author&amp;query=Memon%2C+N">Nasir Memon</a>, <a href="/search/cs?searchtype=author&amp;query=Ahamad%2C+M">Mustaque Ahamad</a>, <a href="/search/cs?searchtype=author&amp;query=Hegde%2C+C">Chinmay Hegde</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.17998v2-abstract-short" style="display: inline;"> Radio remains a pervasive medium for mass information dissemination, with AM/FM stations reaching more Americans than either smartphone-based social networking or live television. Increasingly, radio broadcasts are also streamed online and accessed over the Internet. We present WavePulse, a framework that records, documents, and analyzes radio content in real-time. While our framework is generally&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.17998v2-abstract-full').style.display = 'inline'; document.getElementById('2412.17998v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.17998v2-abstract-full" style="display: none;"> Radio remains a pervasive medium for mass information dissemination, with AM/FM stations reaching more Americans than either smartphone-based social networking or live television. Increasingly, radio broadcasts are also streamed online and accessed over the Internet. We present WavePulse, a framework that records, documents, and analyzes radio content in real-time. While our framework is generally applicable, we showcase the efficacy of WavePulse in a collaborative project with a team of political scientists focusing on the 2024 Presidential Elections. We use WavePulse to monitor livestreams of 396 news radio stations over a period of three months, processing close to 500,000 hours of audio streams. These streams were converted into time-stamped, diarized transcripts and analyzed to track answer key political science questions at both the national and state levels. Our analysis revealed how local issues interacted with national trends, providing insights into information flow. Our results demonstrate WavePulse&#39;s efficacy in capturing and analyzing content from radio livestreams sourced from the Web. Code and dataset can be accessed at \url{https://wave-pulse.io}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.17998v2-abstract-full').style.display = 'none'; document.getElementById('2412.17998v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To appear at The Web Conference (WWW) 2025. 20 Pages, 24 figures. Access code and dataset at https://wave-pulse.io</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.04653">arXiv:2412.04653</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.04653">pdf</a>, <a href="https://arxiv.org/format/2412.04653">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Hidden in the Noise: Two-Stage Robust Watermarking for Images </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Arabi%2C+K">Kasra Arabi</a>, <a href="/search/cs?searchtype=author&amp;query=Feuer%2C+B">Benjamin Feuer</a>, <a href="/search/cs?searchtype=author&amp;query=Witter%2C+R+T">R. Teal Witter</a>, <a href="/search/cs?searchtype=author&amp;query=Hegde%2C+C">Chinmay Hegde</a>, <a href="/search/cs?searchtype=author&amp;query=Cohen%2C+N">Niv Cohen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.04653v3-abstract-short" style="display: inline;"> As the quality of image generators continues to improve, deepfakes become a topic of considerable societal debate. Image watermarking allows responsible model owners to detect and label their AI-generated content, which can mitigate the harm. Yet, current state-of-the-art methods in image watermarking remain vulnerable to forgery and removal attacks. This vulnerability occurs in part because water&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.04653v3-abstract-full').style.display = 'inline'; document.getElementById('2412.04653v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.04653v3-abstract-full" style="display: none;"> As the quality of image generators continues to improve, deepfakes become a topic of considerable societal debate. Image watermarking allows responsible model owners to detect and label their AI-generated content, which can mitigate the harm. Yet, current state-of-the-art methods in image watermarking remain vulnerable to forgery and removal attacks. This vulnerability occurs in part because watermarks distort the distribution of generated images, unintentionally revealing information about the watermarking techniques. In this work, we first demonstrate a distortion-free watermarking method for images, based on a diffusion model&#39;s initial noise. However, detecting the watermark requires comparing the initial noise reconstructed for an image to all previously used initial noises. To mitigate these issues, we propose a two-stage watermarking framework for efficient detection. During generation, we augment the initial noise with generated Fourier patterns to embed information about the group of initial noises we used. For detection, we (i) retrieve the relevant group of noises, and (ii) search within the given group for an initial noise that might match our image. This watermarking approach achieves state-of-the-art robustness to forgery and removal against a large battery of attacks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.04653v3-abstract-full').style.display = 'none'; document.getElementById('2412.04653v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 5 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.01042">arXiv:2412.01042</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.01042">pdf</a>, <a href="https://arxiv.org/format/2412.01042">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> TruncFormer: Private LLM Inference Using Only Truncations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yubeaton%2C+P">Patrick Yubeaton</a>, <a href="/search/cs?searchtype=author&amp;query=Mo%2C+J+C">Jianqiao Cambridge Mo</a>, <a href="/search/cs?searchtype=author&amp;query=Garimella%2C+K">Karthik Garimella</a>, <a href="/search/cs?searchtype=author&amp;query=Jha%2C+N+K">Nandan Kumar Jha</a>, <a href="/search/cs?searchtype=author&amp;query=Reagen%2C+B">Brandon Reagen</a>, <a href="/search/cs?searchtype=author&amp;query=Hegde%2C+C">Chinmay Hegde</a>, <a href="/search/cs?searchtype=author&amp;query=Garg%2C+S">Siddharth Garg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.01042v1-abstract-short" style="display: inline;"> Private inference (PI) serves an important role in guaranteeing the privacy of user data when interfacing with proprietary machine learning models such as LLMs. However, PI remains practically intractable due to the massive latency costs associated with nonlinear functions present in LLMs. Existing works have focused on improving latency of specific LLM nonlinearities (such as the Softmax, or the&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.01042v1-abstract-full').style.display = 'inline'; document.getElementById('2412.01042v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.01042v1-abstract-full" style="display: none;"> Private inference (PI) serves an important role in guaranteeing the privacy of user data when interfacing with proprietary machine learning models such as LLMs. However, PI remains practically intractable due to the massive latency costs associated with nonlinear functions present in LLMs. Existing works have focused on improving latency of specific LLM nonlinearities (such as the Softmax, or the GeLU) via approximations. However, new types of nonlinearities are regularly introduced with new LLM architectures, and this has led to a constant game of catch-up where PI researchers attempt to optimize the newest nonlinear function. We introduce TruncFormer, a framework for taking any LLM and transforming it into a plaintext emulation of PI. Our framework leverages the fact that nonlinearities in LLMs are differentiable and can be accurately approximated with a sequence of additions, multiplications, and truncations. Further, we decouple the add/multiply and truncation operations, and statically determine where truncations should be inserted based on a given field size and input representation size. This leads to latency improvements over existing cryptographic protocols that enforce truncation after every multiplication operation. We open source our code for community use. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.01042v1-abstract-full').style.display = 'none'; document.getElementById('2412.01042v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13683">arXiv:2411.13683</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.13683">pdf</a>, <a href="https://arxiv.org/format/2411.13683">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Extending Video Masked Autoencoders to 128 frames </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gundavarapu%2C+N+B">Nitesh Bharadwaj Gundavarapu</a>, <a href="/search/cs?searchtype=author&amp;query=Friedman%2C+L">Luke Friedman</a>, <a href="/search/cs?searchtype=author&amp;query=Goyal%2C+R">Raghav Goyal</a>, <a href="/search/cs?searchtype=author&amp;query=Hegde%2C+C">Chaitra Hegde</a>, <a href="/search/cs?searchtype=author&amp;query=Agustsson%2C+E">Eirikur Agustsson</a>, <a href="/search/cs?searchtype=author&amp;query=Waghmare%2C+S+M">Sagar M. Waghmare</a>, <a href="/search/cs?searchtype=author&amp;query=Sirotenko%2C+M">Mikhail Sirotenko</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+M">Ming-Hsuan Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Weyand%2C+T">Tobias Weyand</a>, <a href="/search/cs?searchtype=author&amp;query=Gong%2C+B">Boqing Gong</a>, <a href="/search/cs?searchtype=author&amp;query=Sigal%2C+L">Leonid Sigal</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13683v1-abstract-short" style="display: inline;"> Video understanding has witnessed significant progress with recent video foundation models demonstrating strong performance owing to self-supervised pre-training objectives; Masked Autoencoders (MAE) being the design of choice. Nevertheless, the majority of prior works that leverage MAE pre-training have focused on relatively short video representations (16 / 32 frames in length) largely due to ha&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13683v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13683v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13683v1-abstract-full" style="display: none;"> Video understanding has witnessed significant progress with recent video foundation models demonstrating strong performance owing to self-supervised pre-training objectives; Masked Autoencoders (MAE) being the design of choice. Nevertheless, the majority of prior works that leverage MAE pre-training have focused on relatively short video representations (16 / 32 frames in length) largely due to hardware memory and compute limitations that scale poorly with video length due to the dense memory-intensive self-attention decoding. One natural strategy to address these challenges is to subsample tokens to reconstruct during decoding (or decoder masking). In this work, we propose an effective strategy for prioritizing tokens which allows training on longer video sequences (128 frames) and gets better performance than, more typical, random and uniform masking strategies. The core of our approach is an adaptive decoder masking strategy that prioritizes the most important tokens and uses quantized tokens as reconstruction objectives. Our adaptive strategy leverages a powerful MAGVIT-based tokenizer that jointly learns the tokens and their priority. We validate our design choices through exhaustive ablations and observe improved performance of the resulting long-video (128 frames) encoders over short-video (32 frames) counterparts. With our long-video masked autoencoder (LVMAE) strategy, we surpass state-of-the-art on Diving48 by 3.9 points and EPIC-Kitchens-100 verb classification by 2.5 points while relying on a simple core architecture and video-only pre-training (unlike some of the prior works that require millions of labeled video-text pairs or specialized encoders). <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13683v1-abstract-full').style.display = 'none'; document.getElementById('2411.13683v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10.5 pages of main paper, 25 pages total, 4 figures and 10 tables. To appear in NeurIPS&#39;24</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.05057">arXiv:2410.05057</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.05057">pdf</a>, <a href="https://arxiv.org/format/2410.05057">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> SELECT: A Large-Scale Benchmark of Data Curation Strategies for Image Classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Feuer%2C+B">Benjamin Feuer</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+J">Jiawei Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Cohen%2C+N">Niv Cohen</a>, <a href="/search/cs?searchtype=author&amp;query=Yubeaton%2C+P">Patrick Yubeaton</a>, <a href="/search/cs?searchtype=author&amp;query=Mittal%2C+G">Govind Mittal</a>, <a href="/search/cs?searchtype=author&amp;query=Hegde%2C+C">Chinmay Hegde</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.05057v1-abstract-short" style="display: inline;"> Data curation is the problem of how to collect and organize samples into a dataset that supports efficient learning. Despite the centrality of the task, little work has been devoted towards a large-scale, systematic comparison of various curation methods. In this work, we take steps towards a formal evaluation of data curation strategies and introduce SELECT, the first large-scale benchmark of cur&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05057v1-abstract-full').style.display = 'inline'; document.getElementById('2410.05057v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.05057v1-abstract-full" style="display: none;"> Data curation is the problem of how to collect and organize samples into a dataset that supports efficient learning. Despite the centrality of the task, little work has been devoted towards a large-scale, systematic comparison of various curation methods. In this work, we take steps towards a formal evaluation of data curation strategies and introduce SELECT, the first large-scale benchmark of curation strategies for image classification. In order to generate baseline methods for the SELECT benchmark, we create a new dataset, ImageNet++, which constitutes the largest superset of ImageNet-1K to date. Our dataset extends ImageNet with 5 new training-data shifts, each approximately the size of ImageNet-1K itself, and each assembled using a distinct curation strategy. We evaluate our data curation baselines in two ways: (i) using each training-data shift to train identical image classification models from scratch (ii) using the data itself to fit a pretrained self-supervised representation. Our findings show interesting trends, particularly pertaining to recent methods for data curation such as synthetic data generation and lookup based on CLIP embeddings. We show that although these strategies are highly competitive for certain tasks, the curation strategy used to assemble the original ImageNet-1K dataset remains the gold standard. We anticipate that our benchmark can illuminate the path for new methods to further reduce the gap. We release our checkpoints, code, documentation, and a link to our dataset at https://github.com/jimmyxu123/SELECT. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05057v1-abstract-full').style.display = 'none'; document.getElementById('2410.05057v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2024, Datasets and Benchmarks Track</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.18032">arXiv:2409.18032</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.18032">pdf</a>, <a href="https://arxiv.org/format/2409.18032">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Fluid Dynamics">physics.flu-dyn</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> </div> </div> <p class="title is-5 mathjax"> FlowBench: A Large Scale Benchmark for Flow Simulation over Complex Geometries </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Tali%2C+R">Ronak Tali</a>, <a href="/search/cs?searchtype=author&amp;query=Rabeh%2C+A">Ali Rabeh</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+C">Cheng-Hau Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Shadkhah%2C+M">Mehdi Shadkhah</a>, <a href="/search/cs?searchtype=author&amp;query=Karki%2C+S">Samundra Karki</a>, <a href="/search/cs?searchtype=author&amp;query=Upadhyaya%2C+A">Abhisek Upadhyaya</a>, <a href="/search/cs?searchtype=author&amp;query=Dhakshinamoorthy%2C+S">Suriya Dhakshinamoorthy</a>, <a href="/search/cs?searchtype=author&amp;query=Saadati%2C+M">Marjan Saadati</a>, <a href="/search/cs?searchtype=author&amp;query=Sarkar%2C+S">Soumik Sarkar</a>, <a href="/search/cs?searchtype=author&amp;query=Krishnamurthy%2C+A">Adarsh Krishnamurthy</a>, <a href="/search/cs?searchtype=author&amp;query=Hegde%2C+C">Chinmay Hegde</a>, <a href="/search/cs?searchtype=author&amp;query=Balu%2C+A">Aditya Balu</a>, <a href="/search/cs?searchtype=author&amp;query=Ganapathysubramanian%2C+B">Baskar Ganapathysubramanian</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.18032v1-abstract-short" style="display: inline;"> Simulating fluid flow around arbitrary shapes is key to solving various engineering problems. However, simulating flow physics across complex geometries remains numerically challenging and computationally resource-intensive, particularly when using conventional PDE solvers. Machine learning methods offer attractive opportunities to create fast and adaptable PDE solvers. However, benchmark datasets&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.18032v1-abstract-full').style.display = 'inline'; document.getElementById('2409.18032v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.18032v1-abstract-full" style="display: none;"> Simulating fluid flow around arbitrary shapes is key to solving various engineering problems. However, simulating flow physics across complex geometries remains numerically challenging and computationally resource-intensive, particularly when using conventional PDE solvers. Machine learning methods offer attractive opportunities to create fast and adaptable PDE solvers. However, benchmark datasets to measure the performance of such methods are scarce, especially for flow physics across complex geometries. We introduce FlowBench, a dataset for neural simulators with over 10K samples, which is currently larger than any publicly available flow physics dataset. FlowBench contains flow simulation data across complex geometries (\textit{parametric vs. non-parametric}), spanning a range of flow conditions (\textit{Reynolds number and Grashoff number}), capturing a diverse array of flow phenomena (\textit{steady vs. transient; forced vs. free convection}), and for both 2D and 3D. FlowBench contains over 10K data samples, with each sample the outcome of a fully resolved, direct numerical simulation using a well-validated simulator framework designed for modeling transport phenomena in complex geometries. For each sample, we include velocity, pressure, and temperature field data at 3 different resolutions and several summary statistics features of engineering relevance (such as coefficients of lift and drag, and Nusselt numbers). %Additionally, we include masks and signed distance fields for each shape. We envision that FlowBench will enable evaluating the interplay between complex geometry, coupled flow phenomena, and data sufficiency on the performance of current, and future, neural PDE solvers. We enumerate several evaluation metrics to help rank order the performance of neural PDE solvers. We benchmark the performance of several baseline methods including FNO, CNO, WNO, and DeepONet. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.18032v1-abstract-full').style.display = 'none'; document.getElementById('2409.18032v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.02278">arXiv:2409.02278</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.02278">pdf</a>, <a href="https://arxiv.org/format/2409.02278">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Evaluation and Comparison of Visual Language Models for Transportation Engineering Problems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Prajapati%2C+S">Sanjita Prajapati</a>, <a href="/search/cs?searchtype=author&amp;query=Singh%2C+T">Tanu Singh</a>, <a href="/search/cs?searchtype=author&amp;query=Hegde%2C+C">Chinmay Hegde</a>, <a href="/search/cs?searchtype=author&amp;query=Chakraborty%2C+P">Pranamesh Chakraborty</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.02278v1-abstract-short" style="display: inline;"> Recent developments in vision language models (VLM) have shown great potential for diverse applications related to image understanding. In this study, we have explored state-of-the-art VLM models for vision-based transportation engineering tasks such as image classification and object detection. The image classification task involves congestion detection and crack identification, whereas, for obje&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.02278v1-abstract-full').style.display = 'inline'; document.getElementById('2409.02278v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.02278v1-abstract-full" style="display: none;"> Recent developments in vision language models (VLM) have shown great potential for diverse applications related to image understanding. In this study, we have explored state-of-the-art VLM models for vision-based transportation engineering tasks such as image classification and object detection. The image classification task involves congestion detection and crack identification, whereas, for object detection, helmet violations were identified. We have applied open-source models such as CLIP, BLIP, OWL-ViT, Llava-Next, and closed-source GPT-4o to evaluate the performance of these state-of-the-art VLM models to harness the capabilities of language understanding for vision-based transportation tasks. These tasks were performed by applying zero-shot prompting to the VLM models, as zero-shot prompting involves performing tasks without any training on those tasks. It eliminates the need for annotated datasets or fine-tuning for specific tasks. Though these models gave comparative results with benchmark Convolutional Neural Networks (CNN) models in the image classification tasks, for object localization tasks, it still needs improvement. Therefore, this study provides a comprehensive evaluation of the state-of-the-art VLM models highlighting the advantages and limitations of the models, which can be taken as the baseline for future improvement and wide-scale implementation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.02278v1-abstract-full').style.display = 'none'; document.getElementById('2409.02278v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.10442">arXiv:2408.10442</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.10442">pdf</a>, <a href="https://arxiv.org/format/2408.10442">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Feasibility of assessing cognitive impairment via distributed camera network and privacy-preserving edge computing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hegde%2C+C">Chaitra Hegde</a>, <a href="/search/cs?searchtype=author&amp;query=Kiarashi%2C+Y">Yashar Kiarashi</a>, <a href="/search/cs?searchtype=author&amp;query=Levey%2C+A+I">Allan I Levey</a>, <a href="/search/cs?searchtype=author&amp;query=Rodriguez%2C+A+D">Amy D Rodriguez</a>, <a href="/search/cs?searchtype=author&amp;query=Kwon%2C+H">Hyeokhyen Kwon</a>, <a href="/search/cs?searchtype=author&amp;query=Clifford%2C+G+D">Gari D Clifford</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.10442v1-abstract-short" style="display: inline;"> INTRODUCTION: Mild cognitive impairment (MCI) is characterized by a decline in cognitive functions beyond typical age and education-related expectations. Since, MCI has been linked to reduced social interactions and increased aimless movements, we aimed to automate the capture of these behaviors to enhance longitudinal monitoring. METHODS: Using a privacy-preserving distributed camera network, w&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.10442v1-abstract-full').style.display = 'inline'; document.getElementById('2408.10442v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.10442v1-abstract-full" style="display: none;"> INTRODUCTION: Mild cognitive impairment (MCI) is characterized by a decline in cognitive functions beyond typical age and education-related expectations. Since, MCI has been linked to reduced social interactions and increased aimless movements, we aimed to automate the capture of these behaviors to enhance longitudinal monitoring. METHODS: Using a privacy-preserving distributed camera network, we collected movement and social interaction data from groups of individuals with MCI undergoing therapy within a 1700$m^2$ space. We developed movement and social interaction features, which were then used to train a series of machine learning algorithms to distinguish between higher and lower cognitive functioning MCI groups. RESULTS: A Wilcoxon rank-sum test revealed statistically significant differences between high and low-functioning cohorts in features such as linear path length, walking speed, change in direction while walking, entropy of velocity and direction change, and number of group formations in the indoor space. Despite lacking individual identifiers to associate with specific levels of MCI, a machine learning approach using the most significant features provided a 71% accuracy. DISCUSSION: We provide evidence to show that a privacy-preserving low-cost camera network using edge computing framework has the potential to distinguish between different levels of cognitive impairment from the movements and social interactions captured during group activities. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.10442v1-abstract-full').style.display = 'none'; document.getElementById('2408.10442v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.19617">arXiv:2407.19617</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.19617">pdf</a>, <a href="https://arxiv.org/format/2407.19617">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> AgEval: A Benchmark for Zero-Shot and Few-Shot Plant Stress Phenotyping with Multimodal LLMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Arshad%2C+M+A">Muhammad Arbab Arshad</a>, <a href="/search/cs?searchtype=author&amp;query=Jubery%2C+T+Z">Talukder Zaki Jubery</a>, <a href="/search/cs?searchtype=author&amp;query=Roy%2C+T">Tirtho Roy</a>, <a href="/search/cs?searchtype=author&amp;query=Nassiri%2C+R">Rim Nassiri</a>, <a href="/search/cs?searchtype=author&amp;query=Singh%2C+A+K">Asheesh K. Singh</a>, <a href="/search/cs?searchtype=author&amp;query=Singh%2C+A">Arti Singh</a>, <a href="/search/cs?searchtype=author&amp;query=Hegde%2C+C">Chinmay Hegde</a>, <a href="/search/cs?searchtype=author&amp;query=Ganapathysubramanian%2C+B">Baskar Ganapathysubramanian</a>, <a href="/search/cs?searchtype=author&amp;query=Balu%2C+A">Aditya Balu</a>, <a href="/search/cs?searchtype=author&amp;query=Krishnamurthy%2C+A">Adarsh Krishnamurthy</a>, <a href="/search/cs?searchtype=author&amp;query=Sarkar%2C+S">Soumik Sarkar</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.19617v1-abstract-short" style="display: inline;"> Plant stress phenotyping traditionally relies on expert assessments and specialized models, limiting scalability in agriculture. Recent advances in multimodal large language models (LLMs) offer potential solutions to this challenge. We present AgEval, a benchmark comprising 12 diverse plant stress phenotyping tasks, to evaluate these models&#39; capabilities. Our study assesses zero-shot and few-shot&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.19617v1-abstract-full').style.display = 'inline'; document.getElementById('2407.19617v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.19617v1-abstract-full" style="display: none;"> Plant stress phenotyping traditionally relies on expert assessments and specialized models, limiting scalability in agriculture. Recent advances in multimodal large language models (LLMs) offer potential solutions to this challenge. We present AgEval, a benchmark comprising 12 diverse plant stress phenotyping tasks, to evaluate these models&#39; capabilities. Our study assesses zero-shot and few-shot in-context learning performance of state-of-the-art models, including Claude, GPT, Gemini, and LLaVA. Results show significant performance improvements with few-shot learning, with F1 scores increasing from 46.24% to 73.37% in 8-shot identification for the best-performing model. Few-shot examples from other classes in the dataset have negligible or negative impacts, although having the exact category example helps to increase performance by 15.38%. We also quantify the consistency of model performance across different classes within each task, finding that the coefficient of variance (CV) ranges from 26.02% to 58.03% across models, implying that subject matter expertise is needed - of &#39;difficult&#39; classes - to achieve reliability in performance. AgEval establishes baseline metrics for multimodal LLMs in agricultural applications, offering insights into their promise for enhancing plant stress phenotyping at scale. Benchmark and code can be accessed at: https://anonymous.4open.science/r/AgEval/ <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.19617v1-abstract-full').style.display = 'none'; document.getElementById('2407.19617v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.04180">arXiv:2407.04180</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.04180">pdf</a>, <a href="https://arxiv.org/format/2407.04180">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Slice-100K: A Multimodal Dataset for Extrusion-based 3D Printing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Jignasu%2C+A">Anushrut Jignasu</a>, <a href="/search/cs?searchtype=author&amp;query=Marshall%2C+K+O">Kelly O. Marshall</a>, <a href="/search/cs?searchtype=author&amp;query=Mishra%2C+A+K">Ankush Kumar Mishra</a>, <a href="/search/cs?searchtype=author&amp;query=Rillo%2C+L+N">Lucas Nerone Rillo</a>, <a href="/search/cs?searchtype=author&amp;query=Ganapathysubramanian%2C+B">Baskar Ganapathysubramanian</a>, <a href="/search/cs?searchtype=author&amp;query=Balu%2C+A">Aditya Balu</a>, <a href="/search/cs?searchtype=author&amp;query=Hegde%2C+C">Chinmay Hegde</a>, <a href="/search/cs?searchtype=author&amp;query=Krishnamurthy%2C+A">Adarsh Krishnamurthy</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.04180v2-abstract-short" style="display: inline;"> G-code (Geometric code) or RS-274 is the most widely used computer numerical control (CNC) and 3D printing programming language. G-code provides machine instructions for the movement of the 3D printer, especially for the nozzle, stage, and extrusion of material for extrusion-based additive manufacturing. Currently there does not exist a large repository of curated CAD models along with their corre&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.04180v2-abstract-full').style.display = 'inline'; document.getElementById('2407.04180v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.04180v2-abstract-full" style="display: none;"> G-code (Geometric code) or RS-274 is the most widely used computer numerical control (CNC) and 3D printing programming language. G-code provides machine instructions for the movement of the 3D printer, especially for the nozzle, stage, and extrusion of material for extrusion-based additive manufacturing. Currently there does not exist a large repository of curated CAD models along with their corresponding G-code files for additive manufacturing. To address this issue, we present SLICE-100K, a first-of-its-kind dataset of over 100,000 G-code files, along with their tessellated CAD model, LVIS (Large Vocabulary Instance Segmentation) categories, geometric properties, and renderings. We build our dataset from triangulated meshes derived from Objaverse-XL and Thingi10K datasets. We demonstrate the utility of this dataset by finetuning GPT-2 on a subset of the dataset for G-code translation from a legacy G-code format (Sailfish) to a more modern, widely used format (Marlin). SLICE-100K will be the first step in developing a multimodal foundation model for digital manufacturing. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.04180v2-abstract-full').style.display = 'none'; document.getElementById('2407.04180v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Replaced &#34;SLICE-100K&#34; with &#34;Slice-100K&#34;, added acknowledgements, and updated main figure to better capture shadows</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.19314">arXiv:2406.19314</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.19314">pdf</a>, <a href="https://arxiv.org/format/2406.19314">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> LiveBench: A Challenging, Contamination-Free LLM Benchmark </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=White%2C+C">Colin White</a>, <a href="/search/cs?searchtype=author&amp;query=Dooley%2C+S">Samuel Dooley</a>, <a href="/search/cs?searchtype=author&amp;query=Roberts%2C+M">Manley Roberts</a>, <a href="/search/cs?searchtype=author&amp;query=Pal%2C+A">Arka Pal</a>, <a href="/search/cs?searchtype=author&amp;query=Feuer%2C+B">Ben Feuer</a>, <a href="/search/cs?searchtype=author&amp;query=Jain%2C+S">Siddhartha Jain</a>, <a href="/search/cs?searchtype=author&amp;query=Shwartz-Ziv%2C+R">Ravid Shwartz-Ziv</a>, <a href="/search/cs?searchtype=author&amp;query=Jain%2C+N">Neel Jain</a>, <a href="/search/cs?searchtype=author&amp;query=Saifullah%2C+K">Khalid Saifullah</a>, <a href="/search/cs?searchtype=author&amp;query=Naidu%2C+S">Siddartha Naidu</a>, <a href="/search/cs?searchtype=author&amp;query=Hegde%2C+C">Chinmay Hegde</a>, <a href="/search/cs?searchtype=author&amp;query=LeCun%2C+Y">Yann LeCun</a>, <a href="/search/cs?searchtype=author&amp;query=Goldstein%2C+T">Tom Goldstein</a>, <a href="/search/cs?searchtype=author&amp;query=Neiswanger%2C+W">Willie Neiswanger</a>, <a href="/search/cs?searchtype=author&amp;query=Goldblum%2C+M">Micah Goldblum</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.19314v1-abstract-short" style="display: inline;"> Test set contamination, wherein test data from a benchmark ends up in a newer model&#39;s training set, is a well-documented obstacle for fair LLM evaluation and can quickly render benchmarks obsolete. To mitigate this, many recent benchmarks crowdsource new prompts and evaluations from human or LLM judges; however, these can introduce significant biases, and break down when scoring hard questions. In&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.19314v1-abstract-full').style.display = 'inline'; document.getElementById('2406.19314v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.19314v1-abstract-full" style="display: none;"> Test set contamination, wherein test data from a benchmark ends up in a newer model&#39;s training set, is a well-documented obstacle for fair LLM evaluation and can quickly render benchmarks obsolete. To mitigate this, many recent benchmarks crowdsource new prompts and evaluations from human or LLM judges; however, these can introduce significant biases, and break down when scoring hard questions. In this work, we introduce a new benchmark for LLMs designed to be immune to both test set contamination and the pitfalls of LLM judging and human crowdsourcing. We release LiveBench, the first benchmark that (1) contains frequently-updated questions from recent information sources, (2) scores answers automatically according to objective ground-truth values, and (3) contains a wide variety of challenging tasks, spanning math, coding, reasoning, language, instruction following, and data analysis. To achieve this, LiveBench contains questions that are based on recently-released math competitions, arXiv papers, news articles, and datasets, and it contains harder, contamination-free versions of tasks from previous benchmarks such as Big-Bench Hard, AMPS, and IFEval. We evaluate many prominent closed-source models, as well as dozens of open-source models ranging from 0.5B to 110B in size. LiveBench is difficult, with top models achieving below 65% accuracy. We release all questions, code, and model answers. Questions will be added and updated on a monthly basis, and we will release new tasks and harder versions of tasks over time so that LiveBench can distinguish between the capabilities of LLMs as they improve in the future. We welcome community engagement and collaboration for expanding the benchmark tasks and models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.19314v1-abstract-full').style.display = 'none'; document.getElementById('2406.19314v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.17720">arXiv:2406.17720</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.17720">pdf</a>, <a href="https://arxiv.org/format/2406.17720">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> BioTrove: A Large Curated Image Dataset Enabling AI for Biodiversity </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yang%2C+C">Chih-Hsuan Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Feuer%2C+B">Benjamin Feuer</a>, <a href="/search/cs?searchtype=author&amp;query=Jubery%2C+Z">Zaki Jubery</a>, <a href="/search/cs?searchtype=author&amp;query=Deng%2C+Z+K">Zi K. Deng</a>, <a href="/search/cs?searchtype=author&amp;query=Nakkab%2C+A">Andre Nakkab</a>, <a href="/search/cs?searchtype=author&amp;query=Hasan%2C+M+Z">Md Zahid Hasan</a>, <a href="/search/cs?searchtype=author&amp;query=Chiranjeevi%2C+S">Shivani Chiranjeevi</a>, <a href="/search/cs?searchtype=author&amp;query=Marshall%2C+K">Kelly Marshall</a>, <a href="/search/cs?searchtype=author&amp;query=Baishnab%2C+N">Nirmal Baishnab</a>, <a href="/search/cs?searchtype=author&amp;query=Singh%2C+A+K">Asheesh K Singh</a>, <a href="/search/cs?searchtype=author&amp;query=Singh%2C+A">Arti Singh</a>, <a href="/search/cs?searchtype=author&amp;query=Sarkar%2C+S">Soumik Sarkar</a>, <a href="/search/cs?searchtype=author&amp;query=Merchant%2C+N">Nirav Merchant</a>, <a href="/search/cs?searchtype=author&amp;query=Hegde%2C+C">Chinmay Hegde</a>, <a href="/search/cs?searchtype=author&amp;query=Ganapathysubramanian%2C+B">Baskar Ganapathysubramanian</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.17720v2-abstract-short" style="display: inline;"> We introduce BioTrove, the largest publicly accessible dataset designed to advance AI applications in biodiversity. Curated from the iNaturalist platform and vetted to include only research-grade data, BioTrove contains 161.9 million images, offering unprecedented scale and diversity from three primary kingdoms: Animalia (&#34;animals&#34;), Fungi (&#34;fungi&#34;), and Plantae (&#34;plants&#34;), spanning approximately&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.17720v2-abstract-full').style.display = 'inline'; document.getElementById('2406.17720v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.17720v2-abstract-full" style="display: none;"> We introduce BioTrove, the largest publicly accessible dataset designed to advance AI applications in biodiversity. Curated from the iNaturalist platform and vetted to include only research-grade data, BioTrove contains 161.9 million images, offering unprecedented scale and diversity from three primary kingdoms: Animalia (&#34;animals&#34;), Fungi (&#34;fungi&#34;), and Plantae (&#34;plants&#34;), spanning approximately 366.6K species. Each image is annotated with scientific names, taxonomic hierarchies, and common names, providing rich metadata to support accurate AI model development across diverse species and ecosystems. We demonstrate the value of BioTrove by releasing a suite of CLIP models trained using a subset of 40 million captioned images, known as BioTrove-Train. This subset focuses on seven categories within the dataset that are underrepresented in standard image recognition models, selected for their critical role in biodiversity and agriculture: Aves (&#34;birds&#34;), Arachnida (&#34;spiders/ticks/mites&#34;), Insecta (&#34;insects&#34;), Plantae (&#34;plants&#34;), Fungi (&#34;fungi&#34;), Mollusca (&#34;snails&#34;), and Reptilia (&#34;snakes/lizards&#34;). To support rigorous assessment, we introduce several new benchmarks and report model accuracy for zero-shot learning across life stages, rare species, confounding species, and multiple taxonomic levels. We anticipate that BioTrove will spur the development of AI models capable of supporting digital tools for pest control, crop monitoring, biodiversity assessment, and environmental conservation. These advancements are crucial for ensuring food security, preserving ecosystems, and mitigating the impacts of climate change. BioTrove is publicly available, easily accessible, and ready for immediate use. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.17720v2-abstract-full').style.display = 'none'; document.getElementById('2406.17720v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 25 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.09312">arXiv:2405.09312</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2405.09312">pdf</a>, <a href="https://arxiv.org/ps/2405.09312">ps</a>, <a href="https://arxiv.org/format/2405.09312">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Agnostic Active Learning of Single Index Models with Linear Sample Complexity </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gajjar%2C+A">Aarshvi Gajjar</a>, <a href="/search/cs?searchtype=author&amp;query=Tai%2C+W+M">Wai Ming Tai</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+X">Xingyu Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Hegde%2C+C">Chinmay Hegde</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yi Li</a>, <a href="/search/cs?searchtype=author&amp;query=Musco%2C+C">Christopher Musco</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.09312v3-abstract-short" style="display: inline;"> We study active learning methods for single index models of the form $F({\mathbf x}) = f(\langle {\mathbf w}, {\mathbf x}\rangle)$, where $f:\mathbb{R} \to \mathbb{R}$ and ${\mathbf x,\mathbf w} \in \mathbb{R}^d$. In addition to their theoretical interest as simple examples of non-linear neural networks, single index models have received significant recent attention due to applications in scientif&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.09312v3-abstract-full').style.display = 'inline'; document.getElementById('2405.09312v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.09312v3-abstract-full" style="display: none;"> We study active learning methods for single index models of the form $F({\mathbf x}) = f(\langle {\mathbf w}, {\mathbf x}\rangle)$, where $f:\mathbb{R} \to \mathbb{R}$ and ${\mathbf x,\mathbf w} \in \mathbb{R}^d$. In addition to their theoretical interest as simple examples of non-linear neural networks, single index models have received significant recent attention due to applications in scientific machine learning like surrogate modeling for partial differential equations (PDEs). Such applications require sample-efficient active learning methods that are robust to adversarial noise. I.e., that work even in the challenging agnostic learning setting. We provide two main results on agnostic active learning of single index models. First, when $f$ is known and Lipschitz, we show that $\tilde{O}(d)$ samples collected via {statistical leverage score sampling} are sufficient to learn a near-optimal single index model. Leverage score sampling is simple to implement, efficient, and already widely used for actively learning linear models. Our result requires no assumptions on the data distribution, is optimal up to log factors, and improves quadratically on a recent ${O}(d^{2})$ bound of \cite{gajjar2023active}. Second, we show that $\tilde{O}(d)$ samples suffice even in the more difficult setting when $f$ is \emph{unknown}. Our results leverage tools from high dimensional probability, including Dudley&#39;s inequality and dual Sudakov minoration, as well as a novel, distribution-aware discretization of the class of Lipschitz functions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.09312v3-abstract-full').style.display = 'none'; document.getElementById('2405.09312v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.08079">arXiv:2404.08079</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2404.08079">pdf</a>, <a href="https://arxiv.org/format/2404.08079">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Optimization and Control">math.OC</span> </div> </div> <p class="title is-5 mathjax"> DIMAT: Decentralized Iterative Merging-And-Training for Deep Learning Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Saadati%2C+N">Nastaran Saadati</a>, <a href="/search/cs?searchtype=author&amp;query=Pham%2C+M">Minh Pham</a>, <a href="/search/cs?searchtype=author&amp;query=Saleem%2C+N">Nasla Saleem</a>, <a href="/search/cs?searchtype=author&amp;query=Waite%2C+J+R">Joshua R. Waite</a>, <a href="/search/cs?searchtype=author&amp;query=Balu%2C+A">Aditya Balu</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+Z">Zhanhong Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Hegde%2C+C">Chinmay Hegde</a>, <a href="/search/cs?searchtype=author&amp;query=Sarkar%2C+S">Soumik Sarkar</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.08079v1-abstract-short" style="display: inline;"> Recent advances in decentralized deep learning algorithms have demonstrated cutting-edge performance on various tasks with large pre-trained models. However, a pivotal prerequisite for achieving this level of competitiveness is the significant communication and computation overheads when updating these models, which prohibits the applications of them to real-world scenarios. To address this issue,&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.08079v1-abstract-full').style.display = 'inline'; document.getElementById('2404.08079v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.08079v1-abstract-full" style="display: none;"> Recent advances in decentralized deep learning algorithms have demonstrated cutting-edge performance on various tasks with large pre-trained models. However, a pivotal prerequisite for achieving this level of competitiveness is the significant communication and computation overheads when updating these models, which prohibits the applications of them to real-world scenarios. To address this issue, drawing inspiration from advanced model merging techniques without requiring additional training, we introduce the Decentralized Iterative Merging-And-Training (DIMAT) paradigm--a novel decentralized deep learning framework. Within DIMAT, each agent is trained on their local data and periodically merged with their neighboring agents using advanced model merging techniques like activation matching until convergence is achieved. DIMAT provably converges with the best available rate for nonconvex functions with various first-order methods, while yielding tighter error bounds compared to the popular existing approaches. We conduct a comprehensive empirical analysis to validate DIMAT&#39;s superiority over baselines across diverse computer vision tasks sourced from multiple datasets. Empirical results validate our theoretical claims by showing that DIMAT attains faster and higher initial gain in accuracy with independent and identically distributed (IID) and non-IID data, incurring lower communication overhead. This DIMAT paradigm presents a new opportunity for the future decentralized learning, enhancing its adaptability to real-world with sparse and light-weight communication and computation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.08079v1-abstract-full').style.display = 'none'; document.getElementById('2404.08079v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">CVPR 2024 accepted paper, 22 pages, 12 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.03631">arXiv:2404.03631</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2404.03631">pdf</a>, <a href="https://arxiv.org/format/2404.03631">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Robust Concept Erasure Using Task Vectors </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Pham%2C+M">Minh Pham</a>, <a href="/search/cs?searchtype=author&amp;query=Marshall%2C+K+O">Kelly O. Marshall</a>, <a href="/search/cs?searchtype=author&amp;query=Hegde%2C+C">Chinmay Hegde</a>, <a href="/search/cs?searchtype=author&amp;query=Cohen%2C+N">Niv Cohen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.03631v1-abstract-short" style="display: inline;"> With the rapid growth of text-to-image models, a variety of techniques have been suggested to prevent undesirable image generations. Yet, these methods often only protect against specific user prompts and have been shown to allow unsafe generations with other inputs. Here we focus on unconditionally erasing a concept from a text-to-image model rather than conditioning the erasure on the user&#39;s pro&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.03631v1-abstract-full').style.display = 'inline'; document.getElementById('2404.03631v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.03631v1-abstract-full" style="display: none;"> With the rapid growth of text-to-image models, a variety of techniques have been suggested to prevent undesirable image generations. Yet, these methods often only protect against specific user prompts and have been shown to allow unsafe generations with other inputs. Here we focus on unconditionally erasing a concept from a text-to-image model rather than conditioning the erasure on the user&#39;s prompt. We first show that compared to input-dependent erasure methods, concept erasure that uses Task Vectors (TV) is more robust to unexpected user inputs, not seen during training. However, TV-based erasure can also affect the core performance of the edited model, particularly when the required edit strength is unknown. To this end, we propose a method called Diverse Inversion, which we use to estimate the required strength of the TV edit. Diverse Inversion finds within the model input space a large set of word embeddings, each of which induces the generation of the target concept. We find that encouraging diversity in the set makes our estimation more robust to unexpected prompts. Finally, we show that Diverse Inversion enables us to apply a TV edit only to a subset of the model weights, enhancing the erasure capabilities while better maintaining the core functionality of the model. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.03631v1-abstract-full').style.display = 'none'; document.getElementById('2404.03631v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.08092">arXiv:2403.08092</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.08092">pdf</a>, <a href="https://arxiv.org/format/2403.08092">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Mitigating the Impact of Attribute Editing on Face Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Banerjee%2C+S">Sudipta Banerjee</a>, <a href="/search/cs?searchtype=author&amp;query=Mullangi%2C+S+P">Sai Pranaswi Mullangi</a>, <a href="/search/cs?searchtype=author&amp;query=Wagle%2C+S">Shruti Wagle</a>, <a href="/search/cs?searchtype=author&amp;query=Hegde%2C+C">Chinmay Hegde</a>, <a href="/search/cs?searchtype=author&amp;query=Memon%2C+N">Nasir Memon</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.08092v2-abstract-short" style="display: inline;"> Through a large-scale study over diverse face images, we show that facial attribute editing using modern generative AI models can severely degrade automated face recognition systems. This degradation persists even with identity-preserving generative models. To mitigate this issue, we propose two novel techniques for local and global attribute editing. We empirically ablate twenty-six facial semant&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.08092v2-abstract-full').style.display = 'inline'; document.getElementById('2403.08092v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.08092v2-abstract-full" style="display: none;"> Through a large-scale study over diverse face images, we show that facial attribute editing using modern generative AI models can severely degrade automated face recognition systems. This degradation persists even with identity-preserving generative models. To mitigate this issue, we propose two novel techniques for local and global attribute editing. We empirically ablate twenty-six facial semantic, demographic and expression-based attributes that have been edited using state-of-the-art generative models, and evaluate them using ArcFace and AdaFace matchers on CelebA, CelebAMaskHQ and LFW datasets. Finally, we use LLaVA, an emerging visual question-answering framework for attribute prediction to validate our editing techniques. Our methods outperform the current state-of-the-art at facial editing (BLIP, InstantID) while improving identity retention by a significant extent. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.08092v2-abstract-full').style.display = 'none'; document.getElementById('2403.08092v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Under review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.18085">arXiv:2402.18085</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2402.18085">pdf</a>, <a href="https://arxiv.org/format/2402.18085">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> PITCH: AI-assisted Tagging of Deepfake Audio Calls using Challenge-Response </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Mittal%2C+G">Govind Mittal</a>, <a href="/search/cs?searchtype=author&amp;query=Jakobsson%2C+A">Arthur Jakobsson</a>, <a href="/search/cs?searchtype=author&amp;query=Marshall%2C+K+O">Kelly O. Marshall</a>, <a href="/search/cs?searchtype=author&amp;query=Hegde%2C+C">Chinmay Hegde</a>, <a href="/search/cs?searchtype=author&amp;query=Memon%2C+N">Nasir Memon</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.18085v3-abstract-short" style="display: inline;"> The rise of AI voice-cloning technology, particularly audio Real-time Deepfakes (RTDFs), has intensified social engineering attacks by enabling real-time voice impersonation that bypasses conventional enrollment-based authentication. To address this, we propose PITCH, a robust challenge-response method to detect and tag interactive deepfake audio calls. We developed a comprehensive taxonomy of aud&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.18085v3-abstract-full').style.display = 'inline'; document.getElementById('2402.18085v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.18085v3-abstract-full" style="display: none;"> The rise of AI voice-cloning technology, particularly audio Real-time Deepfakes (RTDFs), has intensified social engineering attacks by enabling real-time voice impersonation that bypasses conventional enrollment-based authentication. To address this, we propose PITCH, a robust challenge-response method to detect and tag interactive deepfake audio calls. We developed a comprehensive taxonomy of audio challenges based on the human auditory system, linguistics, and environmental factors, yielding 20 prospective challenges. These were tested against leading voice-cloning systems using a novel dataset comprising 18,600 original and 1.6 million deepfake samples from 100 users. PITCH&#39;s prospective challenges enhanced machine detection capabilities to 88.7% AUROC score on the full unbalanced dataset, enabling us to shortlist 10 functional challenges that balance security and usability. For human evaluation and subsequent analyses, we filtered a challenging, balanced subset. On this subset, human evaluators independently scored 72.6% accuracy, while machines achieved 87.7%. Acknowledging that call environments require higher human control, we aided call receivers in making decisions with them using machines. Our solution uses an early warning system to tag suspicious incoming calls as &#34;Deepfake-likely.&#34; Contrary to prior findings, we discovered that integrating human intuition with machine precision offers complementary advantages. Our solution gave users maximum control and boosted detection accuracy to 84.5%. Evidenced by this jump in accuracy, PITCH demonstrated the potential for AI-assisted pre-screening in call verification processes, offering an adaptable and usable approach to combat real-time voice-cloning attacks. Code to reproduce and access data at \url{https://github.com/mittalgovind/PITCH-Deepfakes}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.18085v3-abstract-full').style.display = 'none'; document.getElementById('2402.18085v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.11137">arXiv:2402.11137</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2402.11137">pdf</a>, <a href="https://arxiv.org/format/2402.11137">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> TuneTables: Context Optimization for Scalable Prior-Data Fitted Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Feuer%2C+B">Benjamin Feuer</a>, <a href="/search/cs?searchtype=author&amp;query=Schirrmeister%2C+R+T">Robin Tibor Schirrmeister</a>, <a href="/search/cs?searchtype=author&amp;query=Cherepanova%2C+V">Valeriia Cherepanova</a>, <a href="/search/cs?searchtype=author&amp;query=Hegde%2C+C">Chinmay Hegde</a>, <a href="/search/cs?searchtype=author&amp;query=Hutter%2C+F">Frank Hutter</a>, <a href="/search/cs?searchtype=author&amp;query=Goldblum%2C+M">Micah Goldblum</a>, <a href="/search/cs?searchtype=author&amp;query=Cohen%2C+N">Niv Cohen</a>, <a href="/search/cs?searchtype=author&amp;query=White%2C+C">Colin White</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.11137v3-abstract-short" style="display: inline;"> While tabular classification has traditionally relied on from-scratch training, a recent breakthrough called prior-data fitted networks (PFNs) challenges this approach. Similar to large language models, PFNs make use of pretraining and in-context learning to achieve strong performance on new tasks in a single forward pass. However, current PFNs have limitations that prohibit their widespread adopt&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.11137v3-abstract-full').style.display = 'inline'; document.getElementById('2402.11137v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.11137v3-abstract-full" style="display: none;"> While tabular classification has traditionally relied on from-scratch training, a recent breakthrough called prior-data fitted networks (PFNs) challenges this approach. Similar to large language models, PFNs make use of pretraining and in-context learning to achieve strong performance on new tasks in a single forward pass. However, current PFNs have limitations that prohibit their widespread adoption. Notably, TabPFN achieves very strong performance on small tabular datasets but is not designed to make predictions for datasets of size larger than 1000. In this work, we overcome these limitations and substantially improve the performance of PFNs via context optimization. We introduce TuneTables, a parameter-efficient fine-tuning strategy for PFNs that compresses large datasets into a smaller learned context. We conduct extensive experiments on 19 algorithms over 98 datasets and find that TuneTables achieves the best performance on average, outperforming boosted trees such as CatBoost, while optimizing fewer than 5% of TabPFN&#39;s parameters. Furthermore, we show that TuneTables can be used as an interpretability tool and can even be used to mitigate biases by optimizing a fairness objective. We open-source our code and raw results at https://github.com/penfever/TuneTables. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.11137v3-abstract-full').style.display = 'none'; document.getElementById('2402.11137v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2024 Poster</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.10609">arXiv:2311.10609</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2311.10609">pdf</a>, <a href="https://arxiv.org/format/2311.10609">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Databases">cs.DB</span> </div> </div> <p class="title is-5 mathjax"> Scaling TabPFN: Sketching and Feature Selection for Tabular Prior-Data Fitted Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Feuer%2C+B">Benjamin Feuer</a>, <a href="/search/cs?searchtype=author&amp;query=Hegde%2C+C">Chinmay Hegde</a>, <a href="/search/cs?searchtype=author&amp;query=Cohen%2C+N">Niv Cohen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.10609v1-abstract-short" style="display: inline;"> Tabular classification has traditionally relied on supervised algorithms, which estimate the parameters of a prediction model using its training data. Recently, Prior-Data Fitted Networks (PFNs) such as TabPFN have successfully learned to classify tabular data in-context: the model parameters are designed to classify new samples based on labelled training samples given after the model training. Wh&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.10609v1-abstract-full').style.display = 'inline'; document.getElementById('2311.10609v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.10609v1-abstract-full" style="display: none;"> Tabular classification has traditionally relied on supervised algorithms, which estimate the parameters of a prediction model using its training data. Recently, Prior-Data Fitted Networks (PFNs) such as TabPFN have successfully learned to classify tabular data in-context: the model parameters are designed to classify new samples based on labelled training samples given after the model training. While such models show great promise, their applicability to real-world data remains limited due to the computational scale needed. Here we study the following question: given a pre-trained PFN for tabular data, what is the best way to summarize the labelled training samples before feeding them to the model? We conduct an initial investigation of sketching and feature-selection methods for TabPFN, and note certain key differences between it and conventionally fitted tabular models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.10609v1-abstract-full').style.display = 'none'; document.getElementById('2311.10609v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">2nd Table Representation Learning Workshop: 37th Conference on Neural Information Processing Systems (NeurIPS 2023)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.09024">arXiv:2311.09024</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2311.09024">pdf</a>, <a href="https://arxiv.org/format/2311.09024">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Fast Certification of Vision-Language Models Using Incremental Randomized Smoothing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Nirala%2C+A+K">A K Nirala</a>, <a href="/search/cs?searchtype=author&amp;query=Joshi%2C+A">A Joshi</a>, <a href="/search/cs?searchtype=author&amp;query=Hegde%2C+C">C Hegde</a>, <a href="/search/cs?searchtype=author&amp;query=Sarkar%2C+S">S Sarkar</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.09024v2-abstract-short" style="display: inline;"> A key benefit of deep vision-language models such as CLIP is that they enable zero-shot open vocabulary classification; the user has the ability to define novel class labels via natural language prompts at inference time. However, while CLIP-based zero-shot classifiers have demonstrated competitive performance across a range of domain shifts, they remain highly vulnerable to adversarial attacks. T&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.09024v2-abstract-full').style.display = 'inline'; document.getElementById('2311.09024v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.09024v2-abstract-full" style="display: none;"> A key benefit of deep vision-language models such as CLIP is that they enable zero-shot open vocabulary classification; the user has the ability to define novel class labels via natural language prompts at inference time. However, while CLIP-based zero-shot classifiers have demonstrated competitive performance across a range of domain shifts, they remain highly vulnerable to adversarial attacks. Therefore, ensuring the robustness of such models is crucial for their reliable deployment in the wild. In this work, we introduce Open Vocabulary Certification (OVC), a fast certification method designed for open-vocabulary models like CLIP via randomized smoothing techniques. Given a base &#34;training&#34; set of prompts and their corresponding certified CLIP classifiers, OVC relies on the observation that a classifier with a novel prompt can be viewed as a perturbed version of nearby classifiers in the base training set. Therefore, OVC can rapidly certify the novel classifier using a variation of incremental randomized smoothing. By using a caching trick, we achieve approximately two orders of magnitude acceleration in the certification process for novel prompts. To achieve further (heuristic) speedups, OVC approximates the embedding space at a given input using a multivariate normal distribution bypassing the need for sampling via forward passes through the vision backbone. We demonstrate the effectiveness of OVC on through experimental evaluation using multiple vision-language backbones on the CIFAR-10 and ImageNet test datasets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.09024v2-abstract-full').style.display = 'none'; document.getElementById('2311.09024v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.04016">arXiv:2311.04016</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2311.04016">pdf</a>, <a href="https://arxiv.org/ps/2311.04016">ps</a>, <a href="https://arxiv.org/format/2311.04016">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Exploring Dataset-Scale Indicators of Data Quality </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Feuer%2C+B">Benjamin Feuer</a>, <a href="/search/cs?searchtype=author&amp;query=Hegde%2C+C">Chinmay Hegde</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.04016v1-abstract-short" style="display: inline;"> Modern computer vision foundation models are trained on massive amounts of data, incurring large economic and environmental costs. Recent research has suggested that improving data quality can significantly reduce the need for data quantity. But what constitutes data quality in computer vision? We posit that the quality of a given dataset can be decomposed into distinct sample-level and dataset-le&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.04016v1-abstract-full').style.display = 'inline'; document.getElementById('2311.04016v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.04016v1-abstract-full" style="display: none;"> Modern computer vision foundation models are trained on massive amounts of data, incurring large economic and environmental costs. Recent research has suggested that improving data quality can significantly reduce the need for data quantity. But what constitutes data quality in computer vision? We posit that the quality of a given dataset can be decomposed into distinct sample-level and dataset-level constituents, and that the former have been more extensively studied than the latter. We ablate the effects of two important dataset-level constituents: label set design, and class balance. By monitoring these constituents using key indicators we provide, researchers and practitioners can better anticipate model performance, measured in terms of its accuracy and robustness to distribution shifts. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.04016v1-abstract-full').style.display = 'none'; document.getElementById('2311.04016v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">1st Workshop on Attributing Model Behavior at Scale: 37th Conference on Neural Information Processing Systems (NeurIPS 2023). 7 pages, 1 figure</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.18208">arXiv:2310.18208</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2310.18208">pdf</a>, <a href="https://arxiv.org/format/2310.18208">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.14778/3665844.3665857">10.14778/3665844.3665857 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> ArcheType: A Novel Framework for Open-Source Column Type Annotation using Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Feuer%2C+B">Benjamin Feuer</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yurong Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Hegde%2C+C">Chinmay Hegde</a>, <a href="/search/cs?searchtype=author&amp;query=Freire%2C+J">Juliana Freire</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.18208v3-abstract-short" style="display: inline;"> Existing deep-learning approaches to semantic column type annotation (CTA) have important shortcomings: they rely on semantic types which are fixed at training time; require a large number of training samples per type and incur large run-time inference costs; and their performance can degrade when evaluated on novel datasets, even when types remain constant. Large language models have exhibited st&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.18208v3-abstract-full').style.display = 'inline'; document.getElementById('2310.18208v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.18208v3-abstract-full" style="display: none;"> Existing deep-learning approaches to semantic column type annotation (CTA) have important shortcomings: they rely on semantic types which are fixed at training time; require a large number of training samples per type and incur large run-time inference costs; and their performance can degrade when evaluated on novel datasets, even when types remain constant. Large language models have exhibited strong zero-shot classification performance on a wide range of tasks and in this paper we explore their use for CTA. We introduce ArcheType, a simple, practical method for context sampling, prompt serialization, model querying, and label remapping, which enables large language models to solve CTA problems in a fully zero-shot manner. We ablate each component of our method separately, and establish that improvements to context sampling and label remapping provide the most consistent gains. ArcheType establishes a new state-of-the-art performance on zero-shot CTA benchmarks (including three new domain-specific benchmarks which we release along with this paper), and when used in conjunction with classical CTA techniques, it outperforms a SOTA DoDuo model on the fine-tuned SOTAB benchmark. Our code is available at https://github.com/penfever/ArcheType. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.18208v3-abstract-full').style.display = 'none'; document.getElementById('2310.18208v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">VLDB 2024</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> H.3.3; H.3; I.2; I.2.7 </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Proceedings of the VLDB Endowment, Volume 17, Issue 9, Pages 2279 - 2292, 2024 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.04604">arXiv:2310.04604</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2310.04604">pdf</a>, <a href="https://arxiv.org/format/2310.04604">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> PriViT: Vision Transformers for Fast Private Inference </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Dhyani%2C+N">Naren Dhyani</a>, <a href="/search/cs?searchtype=author&amp;query=Mo%2C+J">Jianqiao Mo</a>, <a href="/search/cs?searchtype=author&amp;query=Cho%2C+M">Minsu Cho</a>, <a href="/search/cs?searchtype=author&amp;query=Joshi%2C+A">Ameya Joshi</a>, <a href="/search/cs?searchtype=author&amp;query=Garg%2C+S">Siddharth Garg</a>, <a href="/search/cs?searchtype=author&amp;query=Reagen%2C+B">Brandon Reagen</a>, <a href="/search/cs?searchtype=author&amp;query=Hegde%2C+C">Chinmay Hegde</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.04604v1-abstract-short" style="display: inline;"> The Vision Transformer (ViT) architecture has emerged as the backbone of choice for state-of-the-art deep models for computer vision applications. However, ViTs are ill-suited for private inference using secure multi-party computation (MPC) protocols, due to the large number of non-polynomial operations (self-attention, feed-forward rectifiers, layer normalization). We propose PriViT, a gradient b&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.04604v1-abstract-full').style.display = 'inline'; document.getElementById('2310.04604v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.04604v1-abstract-full" style="display: none;"> The Vision Transformer (ViT) architecture has emerged as the backbone of choice for state-of-the-art deep models for computer vision applications. However, ViTs are ill-suited for private inference using secure multi-party computation (MPC) protocols, due to the large number of non-polynomial operations (self-attention, feed-forward rectifiers, layer normalization). We propose PriViT, a gradient based algorithm to selectively &#34;Taylorize&#34; nonlinearities in ViTs while maintaining their prediction accuracy. Our algorithm is conceptually simple, easy to implement, and achieves improved performance over existing approaches for designing MPC-friendly transformer architectures in terms of achieving the Pareto frontier in latency-accuracy. We confirm these improvements via experiments on several standard image classification tasks. Public code is available at https://github.com/NYU-DICE-Lab/privit. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.04604v1-abstract-full').style.display = 'none'; document.getElementById('2310.04604v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">18 pages, 14 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2309.05795">arXiv:2309.05795</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2309.05795">pdf</a>, <a href="https://arxiv.org/ps/2309.05795">ps</a>, <a href="https://arxiv.org/format/2309.05795">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computational Complexity">cs.CC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> On the Fine-Grained Hardness of Inverting Generative Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Keles%2C+F+D">Feyza Duman Keles</a>, <a href="/search/cs?searchtype=author&amp;query=Hegde%2C+C">Chinmay Hegde</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2309.05795v1-abstract-short" style="display: inline;"> The objective of generative model inversion is to identify a size-$n$ latent vector that produces a generative model output that closely matches a given target. This operation is a core computational primitive in numerous modern applications involving computer vision and NLP. However, the problem is known to be computationally challenging and NP-hard in the worst case. This paper aims to provide a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.05795v1-abstract-full').style.display = 'inline'; document.getElementById('2309.05795v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2309.05795v1-abstract-full" style="display: none;"> The objective of generative model inversion is to identify a size-$n$ latent vector that produces a generative model output that closely matches a given target. This operation is a core computational primitive in numerous modern applications involving computer vision and NLP. However, the problem is known to be computationally challenging and NP-hard in the worst case. This paper aims to provide a fine-grained view of the landscape of computational hardness for this problem. We establish several new hardness lower bounds for both exact and approximate model inversion. In exact inversion, the goal is to determine whether a target is contained within the range of a given generative model. Under the strong exponential time hypothesis (SETH), we demonstrate that the computational complexity of exact inversion is lower bounded by $惟(2^n)$ via a reduction from $k$-SAT; this is a strengthening of known results. For the more practically relevant problem of approximate inversion, the goal is to determine whether a point in the model range is close to a given target with respect to the $\ell_p$-norm. When $p$ is a positive odd integer, under SETH, we provide an $惟(2^n)$ complexity lower bound via a reduction from the closest vectors problem (CVP). Finally, when $p$ is even, under the exponential time hypothesis (ETH), we provide a lower bound of $2^{惟(n)}$ via a reduction from Half-Clique and Vertex-Cover. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.05795v1-abstract-full').style.display = 'none'; document.getElementById('2309.05795v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">19 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2309.02465">arXiv:2309.02465</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2309.02465">pdf</a>, <a href="https://arxiv.org/format/2309.02465">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Towards Foundational AI Models for Additive Manufacturing: Language Models for G-Code Debugging, Manipulation, and Comprehension </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Jignasu%2C+A">Anushrut Jignasu</a>, <a href="/search/cs?searchtype=author&amp;query=Marshall%2C+K">Kelly Marshall</a>, <a href="/search/cs?searchtype=author&amp;query=Ganapathysubramanian%2C+B">Baskar Ganapathysubramanian</a>, <a href="/search/cs?searchtype=author&amp;query=Balu%2C+A">Aditya Balu</a>, <a href="/search/cs?searchtype=author&amp;query=Hegde%2C+C">Chinmay Hegde</a>, <a href="/search/cs?searchtype=author&amp;query=Krishnamurthy%2C+A">Adarsh Krishnamurthy</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2309.02465v1-abstract-short" style="display: inline;"> 3D printing or additive manufacturing is a revolutionary technology that enables the creation of physical objects from digital models. However, the quality and accuracy of 3D printing depend on the correctness and efficiency of the G-code, a low-level numerical control programming language that instructs 3D printers how to move and extrude material. Debugging G-code is a challenging task that requ&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.02465v1-abstract-full').style.display = 'inline'; document.getElementById('2309.02465v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2309.02465v1-abstract-full" style="display: none;"> 3D printing or additive manufacturing is a revolutionary technology that enables the creation of physical objects from digital models. However, the quality and accuracy of 3D printing depend on the correctness and efficiency of the G-code, a low-level numerical control programming language that instructs 3D printers how to move and extrude material. Debugging G-code is a challenging task that requires a syntactic and semantic understanding of the G-code format and the geometry of the part to be printed. In this paper, we present the first extensive evaluation of six state-of-the-art foundational large language models (LLMs) for comprehending and debugging G-code files for 3D printing. We design effective prompts to enable pre-trained LLMs to understand and manipulate G-code and test their performance on various aspects of G-code debugging and manipulation, including detection and correction of common errors and the ability to perform geometric transformations. We analyze their strengths and weaknesses for understanding complete G-code files. We also discuss the implications and limitations of using LLMs for G-code comprehension. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.02465v1-abstract-full').style.display = 'none'; document.getElementById('2309.02465v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2308.03821">arXiv:2308.03821</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2308.03821">pdf</a>, <a href="https://arxiv.org/format/2308.03821">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Distributionally Robust Classification on a Data Budget </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Feuer%2C+B">Benjamin Feuer</a>, <a href="/search/cs?searchtype=author&amp;query=Joshi%2C+A">Ameya Joshi</a>, <a href="/search/cs?searchtype=author&amp;query=Pham%2C+M">Minh Pham</a>, <a href="/search/cs?searchtype=author&amp;query=Hegde%2C+C">Chinmay Hegde</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2308.03821v1-abstract-short" style="display: inline;"> Real world uses of deep learning require predictable model behavior under distribution shifts. Models such as CLIP show emergent natural distributional robustness comparable to humans, but may require hundreds of millions of training samples. Can we train robust learners in a domain where data is limited? To rigorously address this question, we introduce JANuS (Joint Annotations and Names Set), a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.03821v1-abstract-full').style.display = 'inline'; document.getElementById('2308.03821v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2308.03821v1-abstract-full" style="display: none;"> Real world uses of deep learning require predictable model behavior under distribution shifts. Models such as CLIP show emergent natural distributional robustness comparable to humans, but may require hundreds of millions of training samples. Can we train robust learners in a domain where data is limited? To rigorously address this question, we introduce JANuS (Joint Annotations and Names Set), a collection of four new training datasets with images, labels, and corresponding captions, and perform a series of carefully controlled investigations of factors contributing to robustness in image classification, then compare those results to findings derived from a large-scale meta-analysis. Using this approach, we show that standard ResNet-50 trained with the cross-entropy loss on 2.4 million image samples can attain comparable robustness to a CLIP ResNet-50 trained on 400 million samples. To our knowledge, this is the first result showing (near) state-of-the-art distributional robustness on limited data budgets. Our dataset is available at \url{https://huggingface.co/datasets/penfever/JANuS_dataset}, and the code used to reproduce our experiments can be found at \url{https://github.com/penfever/vlhub/}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.03821v1-abstract-full').style.display = 'none'; document.getElementById('2308.03821v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">TMLR 2023; openreview link: https://openreview.net/forum?id=D5Z2E8CNsD</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2308.01508">arXiv:2308.01508</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2308.01508">pdf</a>, <a href="https://arxiv.org/format/2308.01508">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Circumventing Concept Erasure Methods For Text-to-Image Generative Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Pham%2C+M">Minh Pham</a>, <a href="/search/cs?searchtype=author&amp;query=Marshall%2C+K+O">Kelly O. Marshall</a>, <a href="/search/cs?searchtype=author&amp;query=Cohen%2C+N">Niv Cohen</a>, <a href="/search/cs?searchtype=author&amp;query=Mittal%2C+G">Govind Mittal</a>, <a href="/search/cs?searchtype=author&amp;query=Hegde%2C+C">Chinmay Hegde</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2308.01508v2-abstract-short" style="display: inline;"> Text-to-image generative models can produce photo-realistic images for an extremely broad range of concepts, and their usage has proliferated widely among the general public. On the flip side, these models have numerous drawbacks, including their potential to generate images featuring sexually explicit content, mirror artistic styles without permission, or even hallucinate (or deepfake) the likene&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.01508v2-abstract-full').style.display = 'inline'; document.getElementById('2308.01508v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2308.01508v2-abstract-full" style="display: none;"> Text-to-image generative models can produce photo-realistic images for an extremely broad range of concepts, and their usage has proliferated widely among the general public. On the flip side, these models have numerous drawbacks, including their potential to generate images featuring sexually explicit content, mirror artistic styles without permission, or even hallucinate (or deepfake) the likenesses of celebrities. Consequently, various methods have been proposed in order to &#34;erase&#34; sensitive concepts from text-to-image models. In this work, we examine five recently proposed concept erasure methods, and show that targeted concepts are not fully excised from any of these methods. Specifically, we leverage the existence of special learned word embeddings that can retrieve &#34;erased&#34; concepts from the sanitized models with no alterations to their weights. Our results highlight the brittleness of post hoc concept erasure methods, and call into question their use in the algorithmic toolkit for AI safety. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.01508v2-abstract-full').style.display = 'none'; document.getElementById('2308.01508v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2307.08585">arXiv:2307.08585</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2307.08585">pdf</a>, <a href="https://arxiv.org/format/2307.08585">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Identity-Preserving Aging of Face Images via Latent Diffusion Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Banerjee%2C+S">Sudipta Banerjee</a>, <a href="/search/cs?searchtype=author&amp;query=Mittal%2C+G">Govind Mittal</a>, <a href="/search/cs?searchtype=author&amp;query=Joshi%2C+A">Ameya Joshi</a>, <a href="/search/cs?searchtype=author&amp;query=Hegde%2C+C">Chinmay Hegde</a>, <a href="/search/cs?searchtype=author&amp;query=Memon%2C+N">Nasir Memon</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2307.08585v1-abstract-short" style="display: inline;"> The performance of automated face recognition systems is inevitably impacted by the facial aging process. However, high quality datasets of individuals collected over several years are typically small in scale. In this work, we propose, train, and validate the use of latent text-to-image diffusion models for synthetically aging and de-aging face images. Our models succeed with few-shot training, a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.08585v1-abstract-full').style.display = 'inline'; document.getElementById('2307.08585v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2307.08585v1-abstract-full" style="display: none;"> The performance of automated face recognition systems is inevitably impacted by the facial aging process. However, high quality datasets of individuals collected over several years are typically small in scale. In this work, we propose, train, and validate the use of latent text-to-image diffusion models for synthetically aging and de-aging face images. Our models succeed with few-shot training, and have the added benefit of being controllable via intuitive textual prompting. We observe high degrees of visual realism in the generated images while maintaining biometric fidelity measured by commonly used metrics. We evaluate our method on two benchmark datasets (CelebA and AgeDB) and observe significant reduction (~44%) in the False Non-Match Rate compared to existing state-of the-art baselines. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.08585v1-abstract-full').style.display = 'none'; document.getElementById('2307.08585v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 July, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to appear in International Joint Conference in Biometrics (IJCB) 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.10159">arXiv:2306.10159</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2306.10159">pdf</a>, <a href="https://arxiv.org/format/2306.10159">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Vision-Language Models can Identify Distracted Driver Behavior from Naturalistic Videos </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hasan%2C+M+Z">Md Zahid Hasan</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+J">Jiajing Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Jiyang Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Rahman%2C+M+S">Mohammed Shaiqur Rahman</a>, <a href="/search/cs?searchtype=author&amp;query=Joshi%2C+A">Ameya Joshi</a>, <a href="/search/cs?searchtype=author&amp;query=Velipasalar%2C+S">Senem Velipasalar</a>, <a href="/search/cs?searchtype=author&amp;query=Hegde%2C+C">Chinmay Hegde</a>, <a href="/search/cs?searchtype=author&amp;query=Sharma%2C+A">Anuj Sharma</a>, <a href="/search/cs?searchtype=author&amp;query=Sarkar%2C+S">Soumik Sarkar</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.10159v4-abstract-short" style="display: inline;"> Recognizing the activities causing distraction in real-world driving scenarios is critical for ensuring the safety and reliability of both drivers and pedestrians on the roadways. Conventional computer vision techniques are typically data-intensive and require a large volume of annotated training data to detect and classify various distracted driving behaviors, thereby limiting their efficiency an&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.10159v4-abstract-full').style.display = 'inline'; document.getElementById('2306.10159v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.10159v4-abstract-full" style="display: none;"> Recognizing the activities causing distraction in real-world driving scenarios is critical for ensuring the safety and reliability of both drivers and pedestrians on the roadways. Conventional computer vision techniques are typically data-intensive and require a large volume of annotated training data to detect and classify various distracted driving behaviors, thereby limiting their efficiency and scalability. We aim to develop a generalized framework that showcases robust performance with access to limited or no annotated training data. Recently, vision-language models have offered large-scale visual-textual pretraining that can be adapted to task-specific learning like distracted driving activity recognition. Vision-language pretraining models, such as CLIP, have shown significant promise in learning natural language-guided visual representations. This paper proposes a CLIP-based driver activity recognition approach that identifies driver distraction from naturalistic driving images and videos. CLIP&#39;s vision embedding offers zero-shot transfer and task-based finetuning, which can classify distracted activities from driving video data. Our results show that this framework offers state-of-the-art performance on zero-shot transfer and video-based CLIP for predicting the driver&#39;s state on two public datasets. We propose both frame-based and video-based frameworks developed on top of the CLIP&#39;s visual representation for distracted driving detection and classification tasks and report the results. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.10159v4-abstract-full').style.display = 'none'; document.getElementById('2306.10159v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">15 pages, 7 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.08183">arXiv:2306.08183</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2306.08183">pdf</a>, <a href="https://arxiv.org/format/2306.08183">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ZeroForge: Feedforward Text-to-Shape Without 3D Supervision </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Marshall%2C+K+O">Kelly O. Marshall</a>, <a href="/search/cs?searchtype=author&amp;query=Pham%2C+M">Minh Pham</a>, <a href="/search/cs?searchtype=author&amp;query=Joshi%2C+A">Ameya Joshi</a>, <a href="/search/cs?searchtype=author&amp;query=Jignasu%2C+A">Anushrut Jignasu</a>, <a href="/search/cs?searchtype=author&amp;query=Balu%2C+A">Aditya Balu</a>, <a href="/search/cs?searchtype=author&amp;query=Krishnamurthy%2C+A">Adarsh Krishnamurthy</a>, <a href="/search/cs?searchtype=author&amp;query=Hegde%2C+C">Chinmay Hegde</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.08183v2-abstract-short" style="display: inline;"> Current state-of-the-art methods for text-to-shape generation either require supervised training using a labeled dataset of pre-defined 3D shapes, or perform expensive inference-time optimization of implicit neural representations. In this work, we present ZeroForge, an approach for zero-shot text-to-shape generation that avoids both pitfalls. To achieve open-vocabulary shape generation, we requir&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.08183v2-abstract-full').style.display = 'inline'; document.getElementById('2306.08183v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.08183v2-abstract-full" style="display: none;"> Current state-of-the-art methods for text-to-shape generation either require supervised training using a labeled dataset of pre-defined 3D shapes, or perform expensive inference-time optimization of implicit neural representations. In this work, we present ZeroForge, an approach for zero-shot text-to-shape generation that avoids both pitfalls. To achieve open-vocabulary shape generation, we require careful architectural adaptation of existing feed-forward approaches, as well as a combination of data-free CLIP-loss and contrastive losses to avoid mode collapse. Using these techniques, we are able to considerably expand the generative ability of existing feed-forward text-to-shape models such as CLIP-Forge. We support our method via extensive qualitative and quantitative evaluations <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.08183v2-abstract-full').style.display = 'none'; document.getElementById('2306.08183v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">19 pages, High resolution figures needed to demonstrate 3D results</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.05062">arXiv:2305.05062</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2305.05062">pdf</a>, <a href="https://arxiv.org/format/2305.05062">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/JISPIN.2023.3337189">10.1109/JISPIN.2023.3337189 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> A Feasibility Study on Indoor Localization and Multi-person Tracking Using Sparsely Distributed Camera Network with Edge Computing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Kwon%2C+H">Hyeokhyen Kwon</a>, <a href="/search/cs?searchtype=author&amp;query=Hegde%2C+C">Chaitra Hegde</a>, <a href="/search/cs?searchtype=author&amp;query=Kiarashi%2C+Y">Yashar Kiarashi</a>, <a href="/search/cs?searchtype=author&amp;query=Madala%2C+V+S+K">Venkata Siva Krishna Madala</a>, <a href="/search/cs?searchtype=author&amp;query=Singh%2C+R">Ratan Singh</a>, <a href="/search/cs?searchtype=author&amp;query=Nakum%2C+A">ArjunSinh Nakum</a>, <a href="/search/cs?searchtype=author&amp;query=Tweedy%2C+R">Robert Tweedy</a>, <a href="/search/cs?searchtype=author&amp;query=Tonetto%2C+L+M">Leandro Miletto Tonetto</a>, <a href="/search/cs?searchtype=author&amp;query=Zimring%2C+C+M">Craig M. Zimring</a>, <a href="/search/cs?searchtype=author&amp;query=Doiron%2C+M">Matthew Doiron</a>, <a href="/search/cs?searchtype=author&amp;query=Rodriguez%2C+A+D">Amy D. Rodriguez</a>, <a href="/search/cs?searchtype=author&amp;query=Levey%2C+A+I">Allan I. Levey</a>, <a href="/search/cs?searchtype=author&amp;query=Clifford%2C+G+D">Gari D. Clifford</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.05062v2-abstract-short" style="display: inline;"> Camera-based activity monitoring systems are becoming an attractive solution for smart building applications with the advances in computer vision and edge computing technologies. In this paper, we present a feasibility study and systematic analysis of a camera-based indoor localization and multi-person tracking system implemented on edge computing devices within a large indoor space. To this end,&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.05062v2-abstract-full').style.display = 'inline'; document.getElementById('2305.05062v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.05062v2-abstract-full" style="display: none;"> Camera-based activity monitoring systems are becoming an attractive solution for smart building applications with the advances in computer vision and edge computing technologies. In this paper, we present a feasibility study and systematic analysis of a camera-based indoor localization and multi-person tracking system implemented on edge computing devices within a large indoor space. To this end, we deployed an end-to-end edge computing pipeline that utilizes multiple cameras to achieve localization, body orientation estimation and tracking of multiple individuals within a large therapeutic space spanning $1700m^2$, all while maintaining a strong focus on preserving privacy. Our pipeline consists of 39 edge computing camera systems equipped with Tensor Processing Units (TPUs) placed in the indoor space&#39;s ceiling. To ensure the privacy of individuals, a real-time multi-person pose estimation algorithm runs on the TPU of the computing camera system. This algorithm extracts poses and bounding boxes, which are utilized for indoor localization, body orientation estimation, and multi-person tracking. Our pipeline demonstrated an average localization error of 1.41 meters, a multiple-object tracking accuracy score of 88.6\%, and a mean absolute body orientation error of 29\degree. These results shows that localization and tracking of individuals in a large indoor space is feasible even with the privacy constrains. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.05062v2-abstract-full').style.display = 'none'; document.getElementById('2305.05062v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.02997">arXiv:2305.02997</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2305.02997">pdf</a>, <a href="https://arxiv.org/format/2305.02997">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> When Do Neural Nets Outperform Boosted Trees on Tabular Data? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=McElfresh%2C+D">Duncan McElfresh</a>, <a href="/search/cs?searchtype=author&amp;query=Khandagale%2C+S">Sujay Khandagale</a>, <a href="/search/cs?searchtype=author&amp;query=Valverde%2C+J">Jonathan Valverde</a>, <a href="/search/cs?searchtype=author&amp;query=C%2C+V+P">Vishak Prasad C</a>, <a href="/search/cs?searchtype=author&amp;query=Feuer%2C+B">Benjamin Feuer</a>, <a href="/search/cs?searchtype=author&amp;query=Hegde%2C+C">Chinmay Hegde</a>, <a href="/search/cs?searchtype=author&amp;query=Ramakrishnan%2C+G">Ganesh Ramakrishnan</a>, <a href="/search/cs?searchtype=author&amp;query=Goldblum%2C+M">Micah Goldblum</a>, <a href="/search/cs?searchtype=author&amp;query=White%2C+C">Colin White</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.02997v4-abstract-short" style="display: inline;"> Tabular data is one of the most commonly used types of data in machine learning. Despite recent advances in neural nets (NNs) for tabular data, there is still an active discussion on whether or not NNs generally outperform gradient-boosted decision trees (GBDTs) on tabular data, with several recent works arguing either that GBDTs consistently outperform NNs on tabular data, or vice versa. In this&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.02997v4-abstract-full').style.display = 'inline'; document.getElementById('2305.02997v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.02997v4-abstract-full" style="display: none;"> Tabular data is one of the most commonly used types of data in machine learning. Despite recent advances in neural nets (NNs) for tabular data, there is still an active discussion on whether or not NNs generally outperform gradient-boosted decision trees (GBDTs) on tabular data, with several recent works arguing either that GBDTs consistently outperform NNs on tabular data, or vice versa. In this work, we take a step back and question the importance of this debate. To this end, we conduct the largest tabular data analysis to date, comparing 19 algorithms across 176 datasets, and we find that the &#39;NN vs. GBDT&#39; debate is overemphasized: for a surprisingly high number of datasets, either the performance difference between GBDTs and NNs is negligible, or light hyperparameter tuning on a GBDT is more important than choosing between NNs and GBDTs. A remarkable exception is the recently-proposed prior-data fitted network, TabPFN: although it is effectively limited to training sets of size 3000, we find that it outperforms all other algorithms on average, even when randomly sampling 3000 training datapoints. Next, we analyze dozens of metafeatures to determine what properties of a dataset make NNs or GBDTs better-suited to perform well. For example, we find that GBDTs are much better than NNs at handling skewed or heavy-tailed feature distributions and other forms of dataset irregularities. Our insights act as a guide for practitioners to determine which techniques may work best on their dataset. Finally, with the goal of accelerating tabular data research, we release the TabZilla Benchmark Suite: a collection of the 36 &#39;hardest&#39; of the datasets we study. Our benchmark suite, codebase, and all raw results are available at https://github.com/naszilla/tabzilla. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.02997v4-abstract-full').style.display = 'none'; document.getElementById('2305.02997v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS Datasets and Benchmarks Track 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2302.10281">arXiv:2302.10281</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2302.10281">pdf</a>, <a href="https://arxiv.org/format/2302.10281">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> LiT Tuned Models for Efficient Species Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Nakkab%2C+A">Andre Nakkab</a>, <a href="/search/cs?searchtype=author&amp;query=Feuer%2C+B">Benjamin Feuer</a>, <a href="/search/cs?searchtype=author&amp;query=Hegde%2C+C">Chinmay Hegde</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2302.10281v1-abstract-short" style="display: inline;"> Recent advances in training vision-language models have demonstrated unprecedented robustness and transfer learning effectiveness; however, standard computer vision datasets are image-only, and therefore not well adapted to such training methods. Our paper introduces a simple methodology for adapting any fine-grained image classification dataset for distributed vision-language pretraining. We impl&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2302.10281v1-abstract-full').style.display = 'inline'; document.getElementById('2302.10281v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2302.10281v1-abstract-full" style="display: none;"> Recent advances in training vision-language models have demonstrated unprecedented robustness and transfer learning effectiveness; however, standard computer vision datasets are image-only, and therefore not well adapted to such training methods. Our paper introduces a simple methodology for adapting any fine-grained image classification dataset for distributed vision-language pretraining. We implement this methodology on the challenging iNaturalist-2021 dataset, comprised of approximately 2.7 million images of macro-organisms across 10,000 classes, and achieve a new state-of-the art model in terms of zero-shot classification accuracy. Somewhat surprisingly, our model (trained using a new method called locked-image text tuning) uses a pre-trained, frozen vision representation, proving that language alignment alone can attain strong transfer learning performance, even on fractious, long-tailed datasets. Our approach opens the door for utilizing high quality vision-language pretrained models in agriculturally relevant applications involving species detection. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2302.10281v1-abstract-full').style.display = 'none'; document.getElementById('2302.10281v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 5 figures, 1 table, presented at AAAI 2023 conference for the AIAFS workshop</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2301.12540">arXiv:2301.12540</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2301.12540">pdf</a>, <a href="https://arxiv.org/format/2301.12540">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Implicit Regularization for Group Sparsity </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jiangyuan Li</a>, <a href="/search/cs?searchtype=author&amp;query=Nguyen%2C+T+V">Thanh V. Nguyen</a>, <a href="/search/cs?searchtype=author&amp;query=Hegde%2C+C">Chinmay Hegde</a>, <a href="/search/cs?searchtype=author&amp;query=Wong%2C+R+K+W">Raymond K. W. Wong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2301.12540v1-abstract-short" style="display: inline;"> We study the implicit regularization of gradient descent towards structured sparsity via a novel neural reparameterization, which we call a diagonally grouped linear neural network. We show the following intriguing property of our reparameterization: gradient descent over the squared regression loss, without any explicit regularization, biases towards solutions with a group sparsity structure. In&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2301.12540v1-abstract-full').style.display = 'inline'; document.getElementById('2301.12540v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2301.12540v1-abstract-full" style="display: none;"> We study the implicit regularization of gradient descent towards structured sparsity via a novel neural reparameterization, which we call a diagonally grouped linear neural network. We show the following intriguing property of our reparameterization: gradient descent over the squared regression loss, without any explicit regularization, biases towards solutions with a group sparsity structure. In contrast to many existing works in understanding implicit regularization, we prove that our training trajectory cannot be simulated by mirror descent. We analyze the gradient dynamics of the corresponding regression problem in the general noise setting and obtain minimax-optimal error rates. Compared to existing bounds for implicit sparse regularization using diagonal linear networks, our analysis with the new reparameterization shows improved sample complexity. In the degenerate case of size-one groups, our approach gives rise to a new algorithm for sparse linear regression. Finally, we demonstrate the efficacy of our approach with several numerical experiments. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2301.12540v1-abstract-full').style.display = 'none'; document.getElementById('2301.12540v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 January, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accepted by ICLR 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2301.06820">arXiv:2301.06820</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2301.06820">pdf</a>, <a href="https://arxiv.org/format/2301.06820">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Pathfinding Neural Cellular Automata </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Earle%2C+S">Sam Earle</a>, <a href="/search/cs?searchtype=author&amp;query=Yildiz%2C+O">Ozlem Yildiz</a>, <a href="/search/cs?searchtype=author&amp;query=Togelius%2C+J">Julian Togelius</a>, <a href="/search/cs?searchtype=author&amp;query=Hegde%2C+C">Chinmay Hegde</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2301.06820v1-abstract-short" style="display: inline;"> Pathfinding makes up an important sub-component of a broad range of complex tasks in AI, such as robot path planning, transport routing, and game playing. While classical algorithms can efficiently compute shortest paths, neural networks could be better suited to adapting these sub-routines to more complex and intractable tasks. As a step toward developing such networks, we hand-code and learn mod&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2301.06820v1-abstract-full').style.display = 'inline'; document.getElementById('2301.06820v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2301.06820v1-abstract-full" style="display: none;"> Pathfinding makes up an important sub-component of a broad range of complex tasks in AI, such as robot path planning, transport routing, and game playing. While classical algorithms can efficiently compute shortest paths, neural networks could be better suited to adapting these sub-routines to more complex and intractable tasks. As a step toward developing such networks, we hand-code and learn models for Breadth-First Search (BFS), i.e. shortest path finding, using the unified architectural framework of Neural Cellular Automata, which are iterative neural networks with equal-size inputs and outputs. Similarly, we present a neural implementation of Depth-First Search (DFS), and outline how it can be combined with neural BFS to produce an NCA for computing diameter of a graph. We experiment with architectural modifications inspired by these hand-coded NCAs, training networks from scratch to solve the diameter problem on grid mazes while exhibiting strong generalization ability. Finally, we introduce a scheme in which data points are mutated adversarially during training. We find that adversarially evolving mazes leads to increased generalization on out-of-distribution examples, while at the same time generating data-sets with significantly more complex solutions for reasoning tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2301.06820v1-abstract-full').style.display = 'none'; document.getElementById('2301.06820v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 January, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2211.03241">arXiv:2211.03241</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2211.03241">pdf</a>, <a href="https://arxiv.org/format/2211.03241">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Numerical Analysis">math.NA</span> </div> </div> <p class="title is-5 mathjax"> Neural PDE Solvers for Irregular Domains </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Khara%2C+B">Biswajit Khara</a>, <a href="/search/cs?searchtype=author&amp;query=Herron%2C+E">Ethan Herron</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+Z">Zhanhong Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Balu%2C+A">Aditya Balu</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+C">Chih-Hsuan Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Saurabh%2C+K">Kumar Saurabh</a>, <a href="/search/cs?searchtype=author&amp;query=Jignasu%2C+A">Anushrut Jignasu</a>, <a href="/search/cs?searchtype=author&amp;query=Sarkar%2C+S">Soumik Sarkar</a>, <a href="/search/cs?searchtype=author&amp;query=Hegde%2C+C">Chinmay Hegde</a>, <a href="/search/cs?searchtype=author&amp;query=Krishnamurthy%2C+A">Adarsh Krishnamurthy</a>, <a href="/search/cs?searchtype=author&amp;query=Ganapathysubramanian%2C+B">Baskar Ganapathysubramanian</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2211.03241v1-abstract-short" style="display: inline;"> Neural network-based approaches for solving partial differential equations (PDEs) have recently received special attention. However, the large majority of neural PDE solvers only apply to rectilinear domains, and do not systematically address the imposition of Dirichlet/Neumann boundary conditions over irregular domain boundaries. In this paper, we present a framework to neurally solve partial dif&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.03241v1-abstract-full').style.display = 'inline'; document.getElementById('2211.03241v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2211.03241v1-abstract-full" style="display: none;"> Neural network-based approaches for solving partial differential equations (PDEs) have recently received special attention. However, the large majority of neural PDE solvers only apply to rectilinear domains, and do not systematically address the imposition of Dirichlet/Neumann boundary conditions over irregular domain boundaries. In this paper, we present a framework to neurally solve partial differential equations over domains with irregularly shaped (non-rectilinear) geometric boundaries. Our network takes in the shape of the domain as an input (represented using an unstructured point cloud, or any other parametric representation such as Non-Uniform Rational B-Splines) and is able to generalize to novel (unseen) irregular domains; the key technical ingredient to realizing this model is a novel approach for identifying the interior and exterior of the computational grid in a differentiable manner. We also perform a careful error analysis which reveals theoretical insights into several sources of error incurred in the model-building process. Finally, we showcase a wide variety of applications, along with favorable comparisons with ground truth solutions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.03241v1-abstract-full').style.display = 'none'; document.getElementById('2211.03241v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 November, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2210.13601">arXiv:2210.13601</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2210.13601">pdf</a>, <a href="https://arxiv.org/format/2210.13601">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Active Learning for Single Neuron Models with Lipschitz Non-Linearities </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gajjar%2C+A">Aarshvi Gajjar</a>, <a href="/search/cs?searchtype=author&amp;query=Hegde%2C+C">Chinmay Hegde</a>, <a href="/search/cs?searchtype=author&amp;query=Musco%2C+C">Christopher Musco</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2210.13601v4-abstract-short" style="display: inline;"> We consider the problem of active learning for single neuron models, also sometimes called ``ridge functions&#39;&#39;, in the agnostic setting (under adversarial label noise). Such models have been shown to be broadly effective in modeling physical phenomena, and for constructing surrogate data-driven models for partial differential equations. Surprisingly, we show that for a single neuron model with a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.13601v4-abstract-full').style.display = 'inline'; document.getElementById('2210.13601v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2210.13601v4-abstract-full" style="display: none;"> We consider the problem of active learning for single neuron models, also sometimes called ``ridge functions&#39;&#39;, in the agnostic setting (under adversarial label noise). Such models have been shown to be broadly effective in modeling physical phenomena, and for constructing surrogate data-driven models for partial differential equations. Surprisingly, we show that for a single neuron model with any Lipschitz non-linearity (such as the ReLU, sigmoid, absolute value, low-degree polynomial, among others), strong provable approximation guarantees can be obtained using a well-known active learning strategy for fitting \emph{linear functions} in the agnostic setting. % -- i.e. for the case when there is no non-linearity. Namely, we can collect samples via statistical \emph{leverage score sampling}, which has been shown to be near-optimal in other active learning scenarios. We support our theoretical results with empirical simulations showing that our proposed active learning strategy based on leverage score sampling outperforms (ordinary) uniform sampling when fitting single neuron models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.13601v4-abstract-full').style.display = 'none'; document.getElementById('2210.13601v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 July, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Inadvertently submitting an incorrect writeup that does not align with the intended content</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2210.07396">arXiv:2210.07396</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2210.07396">pdf</a>, <a href="https://arxiv.org/format/2210.07396">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Caption supervision enables robust learners </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Feuer%2C+B">Benjamin Feuer</a>, <a href="/search/cs?searchtype=author&amp;query=Joshi%2C+A">Ameya Joshi</a>, <a href="/search/cs?searchtype=author&amp;query=Hegde%2C+C">Chinmay Hegde</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2210.07396v2-abstract-short" style="display: inline;"> Vision language (VL) models like CLIP are robust to natural distribution shifts, in part because CLIP learns on unstructured data using a technique called caption supervision; the model inteprets image-linked texts as ground-truth labels. In a carefully controlled comparison study, we show that caption-supervised CNNs trained on a standard cross-entropy loss (with image labels assigned by scanning&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.07396v2-abstract-full').style.display = 'inline'; document.getElementById('2210.07396v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2210.07396v2-abstract-full" style="display: none;"> Vision language (VL) models like CLIP are robust to natural distribution shifts, in part because CLIP learns on unstructured data using a technique called caption supervision; the model inteprets image-linked texts as ground-truth labels. In a carefully controlled comparison study, we show that caption-supervised CNNs trained on a standard cross-entropy loss (with image labels assigned by scanning captions for class names) can exhibit greater distributional robustness than VL models trained on the same data. To facilitate future experiments with high-accuracy caption-supervised models, we introduce CaptionNet (https://github.com/penfever/CaptionNet/), which includes a class-balanced, fully supervised dataset with over 50,000 new human-labeled ImageNet-compliant samples which includes web-scraped captions. In a series of experiments on CaptionNet, we show how the choice of loss function, data filtration and supervision strategy enable robust computer vision. We also provide the codebase necessary to reproduce our experiments at VL Hub (https://github.com/penfever/vlhub/). <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.07396v2-abstract-full').style.display = 'none'; document.getElementById('2210.07396v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 December, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.4.9 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2210.06186">arXiv:2210.06186</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2210.06186">pdf</a>, <a href="https://arxiv.org/format/2210.06186">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> GOTCHA: Real-Time Video Deepfake Detection via Challenge-Response </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Mittal%2C+G">Govind Mittal</a>, <a href="/search/cs?searchtype=author&amp;query=Hegde%2C+C">Chinmay Hegde</a>, <a href="/search/cs?searchtype=author&amp;query=Memon%2C+N">Nasir Memon</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2210.06186v4-abstract-short" style="display: inline;"> With the rise of AI-enabled Real-Time Deepfakes (RTDFs), the integrity of online video interactions has become a growing concern. RTDFs have now made it feasible to replace an imposter&#39;s face with their victim in live video interactions. Such advancement in deepfakes also coaxes detection to rise to the same standard. However, existing deepfake detection techniques are asynchronous and hence ill-s&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.06186v4-abstract-full').style.display = 'inline'; document.getElementById('2210.06186v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2210.06186v4-abstract-full" style="display: none;"> With the rise of AI-enabled Real-Time Deepfakes (RTDFs), the integrity of online video interactions has become a growing concern. RTDFs have now made it feasible to replace an imposter&#39;s face with their victim in live video interactions. Such advancement in deepfakes also coaxes detection to rise to the same standard. However, existing deepfake detection techniques are asynchronous and hence ill-suited for RTDFs. To bridge this gap, we propose a challenge-response approach that establishes authenticity in live settings. We focus on talking-head style video interaction and present a taxonomy of challenges that specifically target inherent limitations of RTDF generation pipelines. We evaluate representative examples from the taxonomy by collecting a unique dataset comprising eight challenges, which consistently and visibly degrades the quality of state-of-the-art deepfake generators. These results are corroborated both by humans and a new automated scoring function, leading to 88.6% and 80.1% AUC, respectively. The findings underscore the promising potential of challenge-response systems for explainable and scalable real-time deepfake detection in practical scenarios. We provide access to data and code at \url{https://github.com/mittalgovind/GOTCHA-Deepfakes}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.06186v4-abstract-full').style.display = 'none'; document.getElementById('2210.06186v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to IEEE Euro S&amp;P 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2209.10105">arXiv:2209.10105</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2209.10105">pdf</a>, <a href="https://arxiv.org/ps/2209.10105">ps</a>, <a href="https://arxiv.org/format/2209.10105">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Distributed Online Non-convex Optimization with Composite Regret </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+Z">Zhanhong Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Balu%2C+A">Aditya Balu</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+X+Y">Xian Yeow Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+Y+M">Young M. Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Hegde%2C+C">Chinmay Hegde</a>, <a href="/search/cs?searchtype=author&amp;query=Sarkar%2C+S">Soumik Sarkar</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2209.10105v1-abstract-short" style="display: inline;"> Regret has been widely adopted as the metric of choice for evaluating the performance of online optimization algorithms for distributed, multi-agent systems. However, data/model variations associated with agents can significantly impact decisions and requires consensus among agents. Moreover, most existing works have focused on developing approaches for (either strongly or non-strongly) convex los&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2209.10105v1-abstract-full').style.display = 'inline'; document.getElementById('2209.10105v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2209.10105v1-abstract-full" style="display: none;"> Regret has been widely adopted as the metric of choice for evaluating the performance of online optimization algorithms for distributed, multi-agent systems. However, data/model variations associated with agents can significantly impact decisions and requires consensus among agents. Moreover, most existing works have focused on developing approaches for (either strongly or non-strongly) convex losses, and very few results have been obtained regarding regret bounds in distributed online optimization for general non-convex losses. To address these two issues, we propose a novel composite regret with a new network regret-based metric to evaluate distributed online optimization algorithms. We concretely define static and dynamic forms of the composite regret. By leveraging the dynamic form of our composite regret, we develop a consensus-based online normalized gradient (CONGD) approach for pseudo-convex losses, and it provably shows a sublinear behavior relating to a regularity term for the path variation of the optimizer. For general non-convex losses, we first shed light on the regret for the setting of distributed online non-convex learning based on recent advances such that no deterministic algorithm can achieve the sublinear regret. We then develop the distributed online non-convex optimization with composite regret (DINOCO) without access to the gradients, depending on an offline optimization oracle. DINOCO is shown to achieve sublinear regret; to our knowledge, this is the first regret bound for general distributed online non-convex learning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2209.10105v1-abstract-full').style.display = 'none'; document.getElementById('2209.10105v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 September, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">41 pages, presented in allerton conference 2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2209.04881">arXiv:2209.04881</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2209.04881">pdf</a>, <a href="https://arxiv.org/ps/2209.04881">ps</a>, <a href="https://arxiv.org/format/2209.04881">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computational Complexity">cs.CC</span> </div> </div> <p class="title is-5 mathjax"> On The Computational Complexity of Self-Attention </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Keles%2C+F+D">Feyza Duman Keles</a>, <a href="/search/cs?searchtype=author&amp;query=Wijewardena%2C+P+M">Pruthuvi Mahesakya Wijewardena</a>, <a href="/search/cs?searchtype=author&amp;query=Hegde%2C+C">Chinmay Hegde</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2209.04881v1-abstract-short" style="display: inline;"> Transformer architectures have led to remarkable progress in many state-of-art applications. However, despite their successes, modern transformers rely on the self-attention mechanism, whose time- and space-complexity is quadratic in the length of the input. Several approaches have been proposed to speed up self-attention mechanisms to achieve sub-quadratic running time; however, the large majorit&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2209.04881v1-abstract-full').style.display = 'inline'; document.getElementById('2209.04881v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2209.04881v1-abstract-full" style="display: none;"> Transformer architectures have led to remarkable progress in many state-of-art applications. However, despite their successes, modern transformers rely on the self-attention mechanism, whose time- and space-complexity is quadratic in the length of the input. Several approaches have been proposed to speed up self-attention mechanisms to achieve sub-quadratic running time; however, the large majority of these works are not accompanied by rigorous error guarantees. In this work, we establish lower bounds on the computational complexity of self-attention in a number of scenarios. We prove that the time complexity of self-attention is necessarily quadratic in the input length, unless the Strong Exponential Time Hypothesis (SETH) is false. This argument holds even if the attention computation is performed only approximately, and for a variety of attention mechanisms. As a complement to our lower bounds, we show that it is indeed possible to approximate dot-product self-attention using finite Taylor series in linear-time, at the cost of having an exponential dependence on the polynomial order. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2209.04881v1-abstract-full').style.display = 'none'; document.getElementById('2209.04881v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 September, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2206.08491">arXiv:2206.08491</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2206.08491">pdf</a>, <a href="https://arxiv.org/format/2206.08491">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Revisiting Self-Distillation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Pham%2C+M">Minh Pham</a>, <a href="/search/cs?searchtype=author&amp;query=Cho%2C+M">Minsu Cho</a>, <a href="/search/cs?searchtype=author&amp;query=Joshi%2C+A">Ameya Joshi</a>, <a href="/search/cs?searchtype=author&amp;query=Hegde%2C+C">Chinmay Hegde</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2206.08491v1-abstract-short" style="display: inline;"> Knowledge distillation is the procedure of transferring &#34;knowledge&#34; from a large model (the teacher) to a more compact one (the student), often being used in the context of model compression. When both models have the same architecture, this procedure is called self-distillation. Several works have anecdotally shown that a self-distilled student can outperform the teacher on held-out data. In this&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2206.08491v1-abstract-full').style.display = 'inline'; document.getElementById('2206.08491v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2206.08491v1-abstract-full" style="display: none;"> Knowledge distillation is the procedure of transferring &#34;knowledge&#34; from a large model (the teacher) to a more compact one (the student), often being used in the context of model compression. When both models have the same architecture, this procedure is called self-distillation. Several works have anecdotally shown that a self-distilled student can outperform the teacher on held-out data. In this work, we systematically study self-distillation in a number of settings. We first show that even with a highly accurate teacher, self-distillation allows a student to surpass the teacher in all cases. Secondly, we revisit existing theoretical explanations of (self) distillation and identify contradicting examples, revealing possible drawbacks of these explanations. Finally, we provide an alternative explanation for the dynamics of self-distillation through the lens of loss landscape geometry. We conduct extensive experiments to show that self-distillation leads to flatter minima, thereby resulting in better generalization. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2206.08491v1-abstract-full').style.display = 'none'; document.getElementById('2206.08491v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 June, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2206.07565">arXiv:2206.07565</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2206.07565">pdf</a>, <a href="https://arxiv.org/format/2206.07565">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> A Meta-Analysis of Distributionally-Robust Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Feuer%2C+B">Benjamin Feuer</a>, <a href="/search/cs?searchtype=author&amp;query=Joshi%2C+A">Ameya Joshi</a>, <a href="/search/cs?searchtype=author&amp;query=Hegde%2C+C">Chinmay Hegde</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2206.07565v1-abstract-short" style="display: inline;"> State-of-the-art image classifiers trained on massive datasets (such as ImageNet) have been shown to be vulnerable to a range of both intentional and incidental distribution shifts. On the other hand, several recent classifiers with favorable out-of-distribution (OOD) robustness properties have emerged, achieving high accuracy on their target tasks while maintaining their in-distribution accuracy&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2206.07565v1-abstract-full').style.display = 'inline'; document.getElementById('2206.07565v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2206.07565v1-abstract-full" style="display: none;"> State-of-the-art image classifiers trained on massive datasets (such as ImageNet) have been shown to be vulnerable to a range of both intentional and incidental distribution shifts. On the other hand, several recent classifiers with favorable out-of-distribution (OOD) robustness properties have emerged, achieving high accuracy on their target tasks while maintaining their in-distribution accuracy on challenging benchmarks. We present a meta-analysis on a wide range of publicly released models, most of which have been published over the last twelve months. Through this meta-analysis, we empirically identify four main commonalities for all the best-performing OOD-robust models, all of which illuminate the considerable promise of vision-language pre-training. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2206.07565v1-abstract-full').style.display = 'none'; document.getElementById('2206.07565v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 June, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To be presented at ICML Workshop on Principles of Distribution Shift 2022. Copyright 2022 by the author(s)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2205.06154">arXiv:2205.06154</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2205.06154">pdf</a>, <a href="https://arxiv.org/format/2205.06154">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Smooth-Reduce: Leveraging Patches for Improved Certified Robustness </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Joshi%2C+A">Ameya Joshi</a>, <a href="/search/cs?searchtype=author&amp;query=Pham%2C+M">Minh Pham</a>, <a href="/search/cs?searchtype=author&amp;query=Cho%2C+M">Minsu Cho</a>, <a href="/search/cs?searchtype=author&amp;query=Boytsov%2C+L">Leonid Boytsov</a>, <a href="/search/cs?searchtype=author&amp;query=Condessa%2C+F">Filipe Condessa</a>, <a href="/search/cs?searchtype=author&amp;query=Kolter%2C+J+Z">J. Zico Kolter</a>, <a href="/search/cs?searchtype=author&amp;query=Hegde%2C+C">Chinmay Hegde</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2205.06154v1-abstract-short" style="display: inline;"> Randomized smoothing (RS) has been shown to be a fast, scalable technique for certifying the robustness of deep neural network classifiers. However, methods based on RS require augmenting data with large amounts of noise, which leads to significant drops in accuracy. We propose a training-free, modified smoothing approach, Smooth-Reduce, that leverages patching and aggregation to provide improved&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2205.06154v1-abstract-full').style.display = 'inline'; document.getElementById('2205.06154v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2205.06154v1-abstract-full" style="display: none;"> Randomized smoothing (RS) has been shown to be a fast, scalable technique for certifying the robustness of deep neural network classifiers. However, methods based on RS require augmenting data with large amounts of noise, which leads to significant drops in accuracy. We propose a training-free, modified smoothing approach, Smooth-Reduce, that leverages patching and aggregation to provide improved classifier certificates. Our algorithm classifies overlapping patches extracted from an input image, and aggregates the predicted logits to certify a larger radius around the input. We study two aggregation schemes -- max and mean -- and show that both approaches provide better certificates in terms of certified accuracy, average certified radii and abstention rates as compared to concurrent approaches. We also provide theoretical guarantees for such certificates, and empirically show significant improvements over other randomized smoothing methods that require expensive retraining. Further, we extend our approach to videos and provide meaningful certificates for video classifiers. A project page can be found at https://nyu-dice-lab.github.io/SmoothReduce/ <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2205.06154v1-abstract-full').style.display = 'none'; document.getElementById('2205.06154v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 May, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2202.02340">arXiv:2202.02340</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2202.02340">pdf</a>, <a href="https://arxiv.org/format/2202.02340">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Selective Network Linearization for Efficient Private Inference </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Cho%2C+M">Minsu Cho</a>, <a href="/search/cs?searchtype=author&amp;query=Joshi%2C+A">Ameya Joshi</a>, <a href="/search/cs?searchtype=author&amp;query=Garg%2C+S">Siddharth Garg</a>, <a href="/search/cs?searchtype=author&amp;query=Reagen%2C+B">Brandon Reagen</a>, <a href="/search/cs?searchtype=author&amp;query=Hegde%2C+C">Chinmay Hegde</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2202.02340v2-abstract-short" style="display: inline;"> Private inference (PI) enables inference directly on cryptographically secure data.While promising to address many privacy issues, it has seen limited use due to extreme runtimes. Unlike plaintext inference, where latency is dominated by FLOPs, in PI non-linear functions (namely ReLU) are the bottleneck. Thus, practical PI demands novel ReLU-aware optimizations. To reduce PI latency we propose a g&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2202.02340v2-abstract-full').style.display = 'inline'; document.getElementById('2202.02340v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2202.02340v2-abstract-full" style="display: none;"> Private inference (PI) enables inference directly on cryptographically secure data.While promising to address many privacy issues, it has seen limited use due to extreme runtimes. Unlike plaintext inference, where latency is dominated by FLOPs, in PI non-linear functions (namely ReLU) are the bottleneck. Thus, practical PI demands novel ReLU-aware optimizations. To reduce PI latency we propose a gradient-based algorithm that selectively linearizes ReLUs while maintaining prediction accuracy. We evaluate our algorithm on several standard PI benchmarks. The results demonstrate up to $4.25\%$ more accuracy (iso-ReLU count at 50K) or $2.2\times$ less latency (iso-accuracy at 70\%) than the current state of the art and advance the Pareto frontier across the latency-accuracy space. To complement empirical results, we present a &#34;no free lunch&#34; theorem that sheds light on how and when network linearization is possible while maintaining prediction accuracy. Public code is available at \url{https://github.com/NYU-DICE-Lab/selective_network_linearization}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2202.02340v2-abstract-full').style.display = 'none'; document.getElementById('2202.02340v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 June, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 February, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published in ICML 2022</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Hegde%2C+C&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Hegde%2C+C&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Hegde%2C+C&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

Pages: 1 2 3 4 5 6 7 8 9 10