Search | arXiv e-print repository

Showing 1–50 of 129 results for author: Baraniuk, R G

Searching in archive cs.
type="text" value="Baraniuk, R G"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Baraniuk%2C+R+G&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Baraniuk, R G"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
1. arXiv:2411.02609 [pdf, other]
   Subjects: cs.SD (Sound); eess.AS (Audio and Speech Processing)
   Title: Estimating the Number and Locations of Boundaries in Reverberant Environments with Deep Learning
   Authors: Toros Arikan, Luca M. Chackalackal, Fatima Ahsan, Konrad Tittel, Andrew C. Singer, Gregory W. Wornell, Richard G. Baraniuk
   Abstract: Underwater acoustic environment estimation is a challenging but important task for remote sensing scenarios. Current estimation methods require high signal strength and a solution to the fragile echo labeling problem to be effective.
In previous publications, we proposed a general deep learning-based method for two-dimensional environment estimation which outperformed the state-of-the-art, both in simulation and in real-life experimental settings. A limitation of this method was that some prior information had to be provided by the user on the number and locations of the reflective boundaries, and that its neural networks had to be re-trained accordingly for different environments. Utilizing more advanced neural network and time delay estimation techniques, the proposed improved method no longer requires prior knowledge of the number of boundaries or their locations, and is able to estimate two-dimensional environments with one or two boundaries. Future work will extend the proposed method to more boundaries and larger-scale environments.
   Submitted 4 November, 2024; originally announced November 2024.

2. arXiv:2410.18832 [pdf, other]
   Subjects: cs.LG (Machine Learning)
   Title: MazeNet: An Accurate, Fast, and Scalable Deep Learning Solution for Steiner Minimum Trees
   Authors: Gabriel Díaz Ramos, Toros Arikan, Richard G. Baraniuk
   Abstract: The Obstacle Avoiding Rectilinear Steiner Minimum Tree (OARSMT) problem, which seeks the shortest interconnection of a given number of terminals in a rectilinear plane while avoiding obstacles, is a critical task in integrated circuit design, network optimization, and robot path planning.
Since OARSMT is NP-hard, exact algorithms scale poorly with the number of terminals, leading practical solvers to sacrifice accuracy for large problems. We propose MazeNet, a deep learning-based method that learns to solve the OARSMT from data. MazeNet reframes OARSMT as a maze-solving task that can be addressed with a recurrent convolutional neural network (RCNN). A key hallmark of MazeNet is its scalability: we only need to train the RCNN blocks on mazes with a small number of terminals; larger mazes can be solved by replicating the same pre-trained blocks to create a larger network. Across a wide range of experiments, MazeNet achieves perfect OARSMT-solving accuracy, significantly reduces runtime compared to classical exact algorithms, and can handle more terminals than state-of-the-art approximate algorithms.
   Submitted 24 October, 2024; originally announced October 2024.
   Comments: 15 pages, 15 figures. Submitted to ICLR 2025.
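The scalability claim above rests on weight tying: one recurrent convolutional block is trained once and simply iterated more times on larger inputs. Below is a minimal sketch of that idea, not the authors' MazeNet architecture; the channel width, kernel size, and iteration counts are illustrative placeholders.

```python
# Hedged sketch of a weight-tied recurrent-convolutional block: the same
# trained weights are reused every iteration, so larger mazes get more
# iterations rather than more parameters.
import torch
import torch.nn as nn

class RecurrentConvBlock(nn.Module):
    def __init__(self, channels: int = 32):
        super().__init__()
        self.step = nn.Sequential(
            nn.Conv2d(channels, channels, kernel_size=3, padding=1),
            nn.ReLU(),
        )

    def forward(self, x: torch.Tensor, iterations: int) -> torch.Tensor:
        for _ in range(iterations):
            x = self.step(x)  # identical weights applied at every step
        return x

block = RecurrentConvBlock()
small_maze = torch.randn(1, 32, 16, 16)
large_maze = torch.randn(1, 32, 64, 64)
out_small = block(small_maze, iterations=20)
out_large = block(large_maze, iterations=80)  # same weights, more steps
```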
3. arXiv:2410.12294 [pdf, other]
   Subjects: cs.HC (Human-Computer Interaction); cs.CL (Computation and Language); cs.CY (Computers and Society); cs.LG (Machine Learning)
   Title: LLM-based Cognitive Models of Students with Misconceptions
   Authors: Shashank Sonkar, Xinghe Chen, Naiming Liu, Richard G. Baraniuk, Mrinmaya Sachan
   Abstract: Accurately modeling student cognition is crucial for developing effective AI-driven educational technologies. A key challenge is creating realistic student models that satisfy two essential properties: (1) accurately replicating specific misconceptions, and (2) correctly solving problems where these misconceptions are not applicable. This dual requirement reflects the complex nature of student understanding, where misconceptions coexist with correct knowledge. This paper investigates whether Large Language Models (LLMs) can be instruction-tuned to meet this dual requirement and effectively simulate student thinking in algebra. We introduce MalAlgoPy, a novel Python library that generates datasets reflecting authentic student solution patterns through a graph-based representation of algebraic problem-solving. Utilizing MalAlgoPy, we define and examine Cognitive Student Models (CSMs): LLMs instruction-tuned to faithfully emulate realistic student behavior. Our findings reveal that LLMs trained on misconception examples can efficiently learn to replicate errors. However, the training diminishes the model's ability to solve problems correctly, particularly for problem types where the misconceptions are not applicable, thus failing to satisfy the second property of CSMs. We demonstrate that by carefully calibrating the ratio of correct to misconception examples in the training data, sometimes as low as 0.25, it is possible to develop CSMs that satisfy both properties. Our insights enhance our understanding of AI-based student models and pave the way for effective adaptive learning systems.
   Submitted 17 October, 2024; v1 submitted 16 October, 2024; originally announced October 2024.
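The ratio calibration described in the abstract can be pictured as a simple dataset-mixing step. A hedged sketch, assuming `correct` and `misconception` are lists of instruction-tuning examples; the helper below is hypothetical and not part of MalAlgoPy.

```python
import random

def build_training_mix(correct, misconception, ratio=0.25, seed=0):
    """Mix correct and misconception examples at a chosen ratio.

    ratio = (# correct) / (# misconception). The abstract reports that
    values as low as 0.25 can suffice for a CSM to satisfy both properties.
    """
    rng = random.Random(seed)
    n_correct = min(len(correct), int(ratio * len(misconception)))
    mix = rng.sample(correct, n_correct) + list(misconception)
    rng.shuffle(mix)
    return mix
```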
4. arXiv:2409.09566 [pdf, other]
   Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence)
   Title: Learning Transferable Features for Implicit Neural Representations
   Authors: Kushal Vyas, Ahmed Imtiaz Humayun, Aniket Dashpute, Richard G. Baraniuk, Ashok Veeraraghavan, Guha Balakrishnan
   Abstract: Implicit neural representations (INRs) have demonstrated success in a variety of applications, including inverse problems and neural rendering. An INR is typically trained to capture one signal of interest, resulting in learned neural features that are highly attuned to that signal. Although such features are often assumed to be less generalizable, we explore their transferability for fitting similar signals. We introduce a new INR training framework, STRAINER, that learns transferable features for fitting INRs to new signals from a given distribution, faster and with better reconstruction quality. Owing to the sequential layer-wise affine operations in an INR, we propose to learn transferable representations by sharing initial encoder layers across multiple INRs with independent decoder layers. At test time, the learned encoder representations are transferred as initialization for an otherwise randomly initialized INR. We find STRAINER to yield extremely powerful initialization for fitting images from the same domain, allowing an approximately +10 dB gain in signal quality early on compared to an untrained INR. STRAINER also provides a simple way to encode data-driven priors in INRs.
We evaluate STRAINER on multiple in-domain and out-of-domain signal fitting tasks and inverse problems and further provide detailed analysis and discussion on the transferability of STRAINER's features. Our demo can be accessed at https://colab.research.google.com/drive/1fBZAwqE8C_lrRPAe-hQZJTWrMJuAKtG2?usp=sharing
   Submitted 14 September, 2024; originally announced September 2024.
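The shared-encoder training scheme lends itself to a compact sketch. The following is a simplified illustration under stated assumptions (plain ReLU MLPs rather than the sinusoidal INR layers typically used; layer sizes are arbitrary), not the released STRAINER code.

```python
# Hedged sketch: one encoder shared by several INRs during training, one
# decoder per signal; the trained encoder then initializes a fresh INR.
import copy
import torch
import torch.nn as nn

def mlp(sizes):
    layers = []
    for i in range(len(sizes) - 1):
        layers += [nn.Linear(sizes[i], sizes[i + 1]), nn.ReLU()]
    return nn.Sequential(*layers[:-1])  # no activation after final layer

encoder = mlp([2, 256, 256])                       # shared across signals
decoders = [mlp([256, 3]) for _ in range(5)]       # independent per signal

coords = torch.rand(1024, 2)                       # (x, y) queries in [0, 1]^2
features = torch.relu(encoder(coords))             # shared representation
rgb_predictions = [dec(features) for dec in decoders]

# Test time: transfer the learned encoder as initialization for a new INR.
new_inr = nn.Sequential(copy.deepcopy(encoder), nn.ReLU(), mlp([256, 3]))
```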
5. arXiv:2406.13781 [pdf, other]
   Subjects: cs.LG (Machine Learning); cs.AI (Artificial Intelligence); cs.CL (Computation and Language); cs.CV (Computer Vision and Pattern Recognition); stat.ML (Machine Learning)
   Title: A Primal-Dual Framework for Transformers and Neural Networks
   Authors: Tan M. Nguyen, Tam Nguyen, Nhat Ho, Andrea L. Bertozzi, Richard G. Baraniuk, Stanley J. Osher
   Abstract: Self-attention is key to the remarkable success of transformers in sequence modeling tasks, including many applications in natural language processing and computer vision. Like neural network layers, these attention mechanisms are often developed by heuristics and experience. To provide a principled framework for constructing attention layers in transformers, we show that self-attention corresponds to the support vector expansion derived from a support vector regression problem, whose primal formulation has the form of a neural network layer. Using our framework, we derive popular attention layers used in practice and propose two new attentions: 1) the Batch Normalized Attention (Attention-BN), derived from the batch normalization layer, and 2) the Attention with Scaled Head (Attention-SH), derived from using less training data to fit the SVR model. We empirically demonstrate the advantages of Attention-BN and Attention-SH in reducing head redundancy, increasing the model's accuracy, and improving the model's efficiency in a variety of practical applications including image and time-series classification.
   Submitted 19 June, 2024; originally announced June 2024.
   Comments: Accepted to ICLR 2023, 26 pages, 4 figures, 14 tables.
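The claimed correspondence can be stated compactly: self-attention has the shape of a support vector expansion in which the value vectors play the role of the expansion coefficients and the softmax-normalized query-key similarity plays the role of the kernel. The display below is a paraphrase of that setup, not the paper's exact derivation:

$$ \mathrm{attn}(\mathbf{q}_i) \;=\; \sum_{j=1}^{N} \frac{\exp\!\big(\mathbf{q}_i^\top \mathbf{k}_j / \sqrt{D}\big)}{\sum_{l=1}^{N} \exp\!\big(\mathbf{q}_i^\top \mathbf{k}_l / \sqrt{D}\big)}\, \mathbf{v}_j \quad\longleftrightarrow\quad f(\mathbf{x}) \;=\; \sum_{j=1}^{N} \alpha_j\, k(\mathbf{x}, \mathbf{x}_j). $$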
6. arXiv:2405.13977 [pdf, ps, other]
   Subjects: cs.LG (Machine Learning); stat.ML (Machine Learning)
   Title: Improving Fairness and Mitigating MADness in Generative Models
   Authors: Paul Mayer, Lorenzo Luzi, Ali Siahkoohi, Don H. Johnson, Richard G. Baraniuk
   Abstract: Generative models unfairly penalize data belonging to minority classes, suffer from model autophagy disorder (MADness), and learn biased estimates of the underlying distribution parameters. Our theoretical and empirical results show that training generative models with intentionally designed hypernetworks leads to models that 1) are more fair when generating datapoints belonging to minority classes, 2) are more stable in a self-consumed (i.e., MAD) setting, and 3) learn parameters that are less statistically biased. To further mitigate unfairness, MADness, and bias, we introduce a regularization term that penalizes discrepancies between a generative model's estimated weights when trained on real data versus its own synthetic data. To facilitate training existing deep generative models within our framework, we offer a scalable implementation of hypernetworks that automatically generates a hypernetwork architecture for any given generative model.
   Submitted 3 October, 2024; v1 submitted 22 May, 2024; originally announced May 2024.
   MSC Class: 68T07
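The regularization term has a simple shape: penalize how far the model's weight estimates drift when fitted to its own samples instead of real data. A minimal sketch, assuming two aligned lists of parameter tensors; the function name and weighting are illustrative, not from the paper's code.

```python
import torch

def weight_discrepancy(params_real, params_synth, lam=1.0):
    """Penalty on the gap between weights estimated from real data and
    weights estimated from the model's own synthetic data."""
    gap = sum(((p - q) ** 2).sum() for p, q in zip(params_real, params_synth))
    return lam * gap
```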
7. arXiv:2405.08134 [pdf, other]
   Subjects: cs.CL (Computation and Language)
   Title: Many-Shot Regurgitation (MSR) Prompting
   Authors: Shashank Sonkar, Richard G. Baraniuk
   Abstract: We introduce Many-Shot Regurgitation (MSR) prompting, a new black-box membership inference attack framework for examining verbatim content reproduction in large language models (LLMs). MSR prompting involves dividing the input text into multiple segments and creating a single prompt that includes a series of faux conversation rounds between a user and a language model to elicit verbatim regurgitation. We apply MSR prompting to diverse text sources, including Wikipedia articles and open educational resources (OER) textbooks, which provide high-quality, factual content and are continuously updated over time. For each source, we curate two dataset types: one that LLMs were likely exposed to during training (D_pre) and another consisting of documents published after the models' training cutoff dates (D_post). To quantify the occurrence of verbatim matches, we employ the Longest Common Substring algorithm and count the frequency of matches at different length thresholds. We then use statistical measures such as Cliff's delta, Kolmogorov-Smirnov (KS) distance, and the Kruskal-Wallis H test to determine whether the distribution of verbatim matches differs significantly between D_pre and D_post. Our findings reveal a striking difference in the distribution of verbatim matches between D_pre and D_post, with the frequency of verbatim reproduction being significantly higher when LLMs (e.g. GPT models and LLaMAs) are prompted with text from datasets they were likely trained on. For instance, when using GPT-3.5 on Wikipedia articles, we observe a substantial effect size (Cliff's delta = -0.984) and a large KS distance (0.875) between the distributions of D_pre and D_post. Our results provide compelling evidence that LLMs are more prone to reproducing verbatim content when the input text is likely sourced from their training data.
   Submitted 13 May, 2024; originally announced May 2024.
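The measurement pipeline in the abstract (longest common substring, then a nonparametric effect size between the D_pre and D_post match distributions) is easy to sketch. A hedged illustration, not the authors' released code: difflib's longest matching block serves as the longest-common-substring step, and the direction of the Cliff's delta comparison is an assumption.

```python
from difflib import SequenceMatcher

def lcs_length(generated: str, source: str) -> int:
    # Longest contiguous run shared by the model output and the source text.
    sm = SequenceMatcher(None, generated, source)
    return sm.find_longest_match(0, len(generated), 0, len(source)).size

def cliffs_delta(xs, ys) -> float:
    # (P[x > y] - P[x < y]) over all pairs; ranges from -1 to 1.
    gt = sum(1 for x in xs for y in ys if x > y)
    lt = sum(1 for x in xs for y in ys if x < y)
    return (gt - lt) / (len(xs) * len(ys))

# Hypothetical match lengths on post- vs pre-cutoff documents:
post = [12, 9, 15, 11]      # D_post: little verbatim overlap
pre = [120, 85, 240, 97]    # D_pre: long verbatim runs
print(cliffs_delta(post, pre))  # -1.0 on this toy data; the abstract
                                # reports -0.984 for GPT-3.5 on Wikipedia
```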
8. arXiv:2404.15156 [pdf, other]
   Subjects: cs.CL (Computation and Language)
   Title: Student Data Paradox and Curious Case of Single Student-Tutor Model: Regressive Side Effects of Training LLMs for Personalized Learning
   Authors: Shashank Sonkar, Naiming Liu, Richard G. Baraniuk
   Abstract: The pursuit of personalized education has led to the integration of Large Language Models (LLMs) in developing intelligent tutoring systems. To better understand and adapt to individual student needs, including their misconceptions, LLMs need to be trained on extensive datasets of student-tutor dialogues. Our research uncovers a fundamental challenge in this approach: the "Student Data Paradox." This paradox emerges when LLMs, trained on student data to understand learner behavior, inadvertently compromise their own factual knowledge and reasoning abilities. We investigate this paradox by training state-of-the-art language models on student-tutor dialogue datasets and evaluating their performance across multiple benchmarks. These benchmarks assess various aspects of language model capabilities, including reasoning, truthfulness, and common sense understanding. Our findings reveal significant declines in the models' performance across these diverse benchmarks, indicating a broad impact on their capabilities when trained to model student behavior. Our research makes two primary contributions: (1) empirical demonstration of the Student Data Paradox through quantitative analysis of model performance, and (2) introduction of "hallucination tokens" as a mitigation strategy. These tokens, while improving performance, highlight the persistent challenge of balancing accurate student behavior modeling with maintaining the LLM's integrity as an educational tool.
This study emphasizes the need for innovative solutions to reconcile the conflicting goals of faithfully understanding diverse student cognition while preserving the model's ability to provide accurate information and guidance.
   Submitted 5 October, 2024; v1 submitted 23 April, 2024; originally announced April 2024.
   Comments: Accepted at EMNLP 2024 Findings Track.

9. arXiv:2404.14316 [pdf, other]
   Subjects: cs.CL (Computation and Language)
   Title: Automated Long Answer Grading with RiceChem Dataset
   Authors: Shashank Sonkar, Kangqi Ni, Lesa Tran Lu, Kristi Kincaid, John S. Hutchinson, Richard G. Baraniuk
   Abstract: We introduce a new area of study in the field of educational Natural Language Processing: Automated Long Answer Grading (ALAG). Distinguishing itself from Automated Short Answer Grading (ASAG) and Automated Essay Grading (AEG), ALAG presents unique challenges due to the complexity and multifaceted nature of fact-based long answers.
To study ALAG, we introduce RiceChem, a dataset derived from a college chemistry course, featuring real student responses to long-answer questions with an average word count notably higher than typical ASAG datasets. We propose a novel approach to ALAG by formulating it as a rubric entailment problem, employing natural language inference models to verify whether each criterion, represented by a rubric item, is addressed in the student's response. This formulation enables the effective use of MNLI for transfer learning, significantly improving the performance of models on the RiceChem dataset. We demonstrate the importance of rubric-based formulation in ALAG, showcasing its superiority over traditional score-based approaches in capturing the nuances of student responses. We also investigate the performance of models in cold start scenarios, providing valuable insights into practical deployment considerations in educational settings. Lastly, we benchmark state-of-the-art open-sourced Large Language Models (LLMs) on RiceChem and compare their results to GPT models, highlighting the increased complexity of ALAG compared to ASAG. Despite leveraging the benefits of a rubric-based approach and transfer learning from MNLI, the lower performance of LLMs on RiceChem underscores the significant difficulty posed by the ALAG task. With this work, we offer a fresh perspective on grading long, fact-based answers and introduce a new dataset to stimulate further research in this important area. Code: https://github.com/luffycodes/Automated-Long-Answer-Grading
   Submitted 22 April, 2024; originally announced April 2024.
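The rubric-entailment formulation maps directly onto an off-the-shelf NLI pipeline: the student response serves as the premise and each rubric item as the hypothesis. A sketch under assumptions (the checkpoint name, helper, and hard entailment threshold are examples; the paper's models and setup may differ):

```python
from transformers import pipeline

# An MNLI-finetuned classifier; labels are CONTRADICTION/NEUTRAL/ENTAILMENT.
nli = pipeline("text-classification", model="roberta-large-mnli")

def rubric_scores(student_response, rubric_items):
    scores = {}
    for item in rubric_items:
        # Premise: student response. Hypothesis: one rubric criterion.
        pred = nli([{"text": student_response, "text_pair": item}])[0]
        scores[item] = (pred["label"] == "ENTAILMENT")
    return scores
```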
10. arXiv:2404.14301 [pdf, other]
   Subjects: cs.CL (Computation and Language)
   Title: Marking: Visual Grading with Highlighting Errors and Annotating Missing Bits
   Authors: Shashank Sonkar, Naiming Liu, Debshila B. Mallick, Richard G. Baraniuk
   Abstract: In this paper, we introduce "Marking", a novel grading task that enhances automated grading systems by performing an in-depth analysis of student responses and providing students with visual highlights. Unlike traditional systems that provide binary scores, "Marking" identifies and categorizes segments of the student response as correct, incorrect, or irrelevant, and detects omissions from gold answers. We introduce a new dataset meticulously curated by Subject Matter Experts specifically for this task. We frame "Marking" as an extension of the Natural Language Inference (NLI) task, which is extensively explored in the field of Natural Language Processing. The gold answer and the student response play the roles of premise and hypothesis in NLI, respectively. We subsequently train language models to identify entailment, contradiction, and neutrality from the student response, akin to NLI, and with the added dimension of identifying omissions from gold answers. Our experimental setup involves the use of transformer models, specifically BERT and RoBERTa, and an intelligent training step using the e-SNLI dataset. We present extensive baseline results highlighting the complexity of the "Marking" task, which sets a clear trajectory for the upcoming study. Our work not only opens up new avenues for research in AI-powered educational assessment tools, but also provides a valuable benchmark for the AI in education community to engage with and improve upon in the future. The code and dataset can be found at https://github.com/luffycodes/marking
   Submitted 22 April, 2024; originally announced April 2024.
11. arXiv:2402.15989 [pdf, other]
   Subjects: cs.AI (Artificial Intelligence); eess.SY (Systems and Control)
   Title: PIDformer: Transformer Meets Control Theory
   Authors: Tam Nguyen, César A. Uribe, Tan M. Nguyen, Richard G. Baraniuk
   Abstract: In this work, we address two main shortcomings of transformer architectures: input corruption and rank collapse in their output representation. We unveil self-attention as an autonomous state-space model that inherently promotes smoothness in its solutions, leading to lower-rank outputs and diminished representation capacity. Moreover, the steady-state solution of the model is sensitive to input perturbations. We incorporate a Proportional-Integral-Derivative (PID) closed-loop feedback control system with a reference point into the model to improve robustness and representation capacity. This integration aims to preserve high-frequency details while bolstering model stability, rendering it more noise-resilient. The resulting controlled state-space model is theoretically proven robust and adept at addressing the rank collapse. Motivated by this control framework, we derive a novel class of transformers, the PID-controlled Transformer (PIDformer), aimed at improving robustness and mitigating the rank-collapse issue inherent in softmax transformers. We empirically evaluate the model for advantages and robustness against baseline transformers across various practical tasks, including object classification, image segmentation, and language modeling.
   Submitted 25 February, 2024; originally announced February 2024.
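For readers unfamiliar with the control-theoretic ingredient, the PID feedback the abstract refers to combines proportional, integral, and derivative terms of the error against a reference signal. A generic discrete PID step is sketched below as an illustration of the classical controller, not the PIDformer layer itself; the gains are arbitrary placeholders.

```python
def pid_step(reference, state, integral, prev_error,
             kp=1.0, ki=0.1, kd=0.05, dt=1.0):
    # Error between the reference point and the current state.
    error = reference - state
    integral += error * dt                   # accumulate past error (I)
    derivative = (error - prev_error) / dt   # anticipate future error (D)
    control = kp * error + ki * integral + kd * derivative
    return control, integral, error          # carry state to the next step
```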
12. arXiv:2402.05000 [pdf, other]
   Subjects: cs.CL (Computation and Language)
   Title: Pedagogical Alignment of Large Language Models
   Authors: Shashank Sonkar, Kangqi Ni, Sapana Chaudhary, Richard G. Baraniuk
   Abstract: Large Language Models (LLMs), when used in educational settings without pedagogical fine-tuning, often provide immediate answers rather than guiding students through the problem-solving process. This approach falls short of pedagogically best practices and limits their effectiveness as educational tools. We term the objective of training LLMs to emulate effective teaching strategies "pedagogical alignment." In this paper, we investigate Learning from Human Preferences (LHP) algorithms to achieve this alignment objective. A key challenge in this process is the scarcity of high-quality preference datasets to guide the alignment. To address this, we propose a novel approach for constructing a large-scale dataset using synthetic data generation techniques, eliminating the need for time-consuming and costly manual annotation. Leveraging this dataset, our experiments with Llama and Mistral models demonstrate that LHP methods outperform standard supervised fine-tuning (SFT), improving pedagogical alignment accuracy by 13.1% and 8.7%, respectively. Existing evaluation methods also lack quantitative metrics to adequately measure the pedagogical alignment of LLMs. To address this gap, we propose novel perplexity-based metrics that quantify LLMs' tendency to provide scaffolded guidance versus direct answers, offering a robust measure of pedagogical alignment.
Our analysis provides compelling evidence for the superiority of LHP methods over SFT in optimizing LLMs&#39; behavior, underscoring the potential of LHP methods in better aligning LLMs with educational objectives and fostering effective learning experiences. Code and models are available \href{https://github.com/luffycodes/Tutorbot-Spock}{here}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.05000v3-abstract-full').style.display = 'none'; document.getElementById('2402.05000v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at EMNLP 2024 Findings Track</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.14429">arXiv:2401.14429</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2401.14429">pdf</a>, <a href="https://arxiv.org/ps/2401.14429">ps</a>, <a href="https://arxiv.org/format/2401.14429">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> [Re] The Discriminative Kalman Filter for Bayesian Filtering with Nonlinear and Non-Gaussian Observation Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Casco-Rodriguez%2C+J">Josue Casco-Rodriguez</a>, <a href="/search/cs?searchtype=author&amp;query=Kemere%2C+C">Caleb Kemere</a>, <a href="/search/cs?searchtype=author&amp;query=Baraniuk%2C+R+G">Richard G. Baraniuk</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.14429v1-abstract-short" style="display: inline;"> Kalman filters provide a straightforward and interpretable means to estimate hidden or latent variables, and have found numerous applications in control, robotics, signal processing, and machine learning. One such application is neural decoding for neuroprostheses. In 2020, Burkhart et al. 
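One plausible form a perplexity-based comparison could take, sketched with a stock causal LM; the model choice and example replies are illustrative, not the paper's metric code:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

def perplexity(text: str) -> float:
    """Perplexity of `text` under the LM (exp of mean token NLL)."""
    ids = tok(text, return_tensors="pt").input_ids
    with torch.no_grad():
        loss = model(ids, labels=ids).loss  # mean cross-entropy over tokens
    return float(torch.exp(loss))

# Compare a scaffolded (guiding) reply against a direct-answer reply:
scaffolded = "Let's break the problem into steps. What force acts on the ball first?"
direct = "The answer is 9.8 m/s^2."
print(perplexity(scaffolded), perplexity(direct))
```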
arXiv:2401.14429 (https://arxiv.org/abs/2401.14429) [cs.LG, cs.RO, eess.SP, stat.ML]
[Re] The Discriminative Kalman Filter for Bayesian Filtering with Nonlinear and Non-Gaussian Observation Models
Authors: Josue Casco-Rodriguez, Caleb Kemere, Richard G. Baraniuk
Abstract: Kalman filters provide a straightforward and interpretable means to estimate hidden or latent variables, and have found numerous applications in control, robotics, signal processing, and machine learning. One such application is neural decoding for neuroprostheses. In 2020, Burkhart et al. thoroughly evaluated their new version of the Kalman filter that leverages Bayes' theorem to improve filter performance for highly non-linear or non-Gaussian observation models. This work provides an open-source Python alternative to the authors' MATLAB algorithm. Specifically, we reproduce their most salient results for neuroscientific contexts and further examine the efficacy of their filter using multiple random seeds and previously unused trials from the authors' dataset. All experiments were performed offline on a single computer.
Submitted 24 January, 2024; originally announced January 2024.
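For orientation, a minimal linear Kalman filter recursion, the baseline the discriminative variant builds on; this is a textbook sketch, not Burkhart et al.'s algorithm or the reproduction code:

```python
import numpy as np

def kalman_step(x, P, z, A, Q, H, R):
    """One predict/update cycle of a linear Kalman filter.

    x, P : posterior mean/covariance of the latent state
    z    : new observation
    A, Q : state dynamics and process noise; H, R : observation model and noise
    """
    # predict
    x_pred = A @ x
    P_pred = A @ P @ A.T + Q
    # update
    S = H @ P_pred @ H.T + R              # innovation covariance
    K = P_pred @ H.T @ np.linalg.inv(S)   # Kalman gain
    x_new = x_pred + K @ (z - H @ x_pred)
    P_new = (np.eye(len(x)) - K @ H) @ P_pred
    return x_new, P_new
```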
arXiv:2312.00751 (https://arxiv.org/abs/2312.00751) [cs.CL, cs.AI]
Mitigating Over-smoothing in Transformers via Regularized Nonlocal Functionals
Authors: Tam Nguyen, Tan M. Nguyen, Richard G. Baraniuk
Abstract: Transformers have achieved remarkable success in a wide range of natural language processing and computer vision applications. However, the representation capacity of a deep transformer model is degraded due to the over-smoothing issue, in which the token representations become identical as the model's depth grows. In this work, we show that self-attention layers in transformers minimize a functional which promotes smoothness, thereby causing token uniformity. We then propose a novel regularizer that penalizes the norm of the difference between the smooth output tokens from self-attention and the input tokens to preserve the fidelity of the tokens. Minimizing the resulting regularized energy functional, we derive the Neural Transformer with a Regularized Nonlocal Functional (NeuTRENO), a novel class of transformer models that can mitigate the over-smoothing issue. We empirically demonstrate the advantages of NeuTRENO over the baseline transformers and state-of-the-art methods in reducing the over-smoothing of token representations on various practical tasks, including object classification, image segmentation, and language modeling.
Submitted 1 December, 2023; originally announced December 2023.
Comments: 24 pages
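A simplified reading of the fidelity idea in code: the penalty on the difference between smoothed outputs and inputs shows up as a correction term added to the attention output. The weighting `lam` and the single-head setup are illustrative assumptions:

```python
import numpy as np

def softmax(z):
    e = np.exp(z - z.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

def attention_with_fidelity(x, x0, Wq, Wk, Wv, lam=0.5):
    """Single-head self-attention plus a fidelity correction toward x0.

    x0 is the token matrix whose fidelity we want to preserve (e.g., the
    layer input); the `lam * (x0 - out)` term pulls the smoothed output back.
    """
    q, k, v = x @ Wq, x @ Wk, x @ Wv
    out = softmax(q @ k.T / np.sqrt(k.shape[-1])) @ v
    return out + lam * (x0 - out)
```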
arXiv:2310.02439 (https://arxiv.org/abs/2310.02439) [cs.CL]
Novice Learner and Expert Tutor: Evaluating Math Reasoning Abilities of Large Language Models with Misconceptions
Authors: Naiming Liu, Shashank Sonkar, Zichao Wang, Simon Woodhead, Richard G. Baraniuk
Abstract: We propose novel evaluations for the mathematical reasoning capabilities of Large Language Models (LLMs) based on mathematical misconceptions. Our primary approach is to simulate LLMs as a novice learner and an expert tutor, aiming to identify the incorrect answer to a math question resulting from a specific misconception, and to recognize the misconception(s) behind an incorrect answer, respectively. Contrary to traditional LLM-based mathematical evaluations that focus on answering math questions correctly, our approach takes inspiration from principles in the educational learning sciences. We explicitly ask LLMs to mimic a novice learner by answering questions in a specific incorrect manner based on incomplete knowledge, and to mimic an expert tutor by identifying the misconception(s) corresponding to an incorrect answer to a question. Using simple grade-school math problems, our experiments reveal that, while LLMs can easily answer these questions correctly, they struggle to identify 1) the incorrect answer corresponding to specific incomplete knowledge (misconceptions) and 2) the misconceptions that explain particular incorrect answers. Our study indicates new opportunities for enhancing LLMs' math reasoning capabilities, especially in developing robust student simulation and expert tutoring models for educational applications such as intelligent tutoring systems.
Submitted 3 October, 2023; originally announced October 2023.
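A hypothetical prompt pair illustrating the two evaluation roles; the wording, question, and misconception below are invented for illustration and do not reproduce the paper's dataset:

```python
# Hypothetical prompts for the two roles (novice learner vs. expert tutor).
novice_prompt = (
    "You are a novice student who holds the misconception that multiplying "
    "always makes numbers bigger. Answer: What is 6 x 1/2?"
)
tutor_prompt = (
    "You are an expert tutor. A student answered '12' to 'What is 6 x 1/2?'. "
    "Which misconception most likely explains this incorrect answer?"
)
```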
arXiv:2310.00545 (https://arxiv.org/abs/2310.00545) [eess.SP, cs.CV, eess.IV]
Implicit Neural Representations and the Algebra of Complex Wavelets
Authors: T. Mitchell Roddenberry, Vishwanath Saragadam, Maarten V. de Hoop, Richard G. Baraniuk
Abstract: Implicit neural representations (INRs) have arisen as useful methods for representing signals on Euclidean domains. By parameterizing an image as a multilayer perceptron (MLP) on Euclidean space, INRs effectively represent signals in a way that couples the spatial and spectral features of the signal, a coupling that is not obvious in the usual discrete representation, paving the way for continuous signal processing and machine learning approaches that were not previously possible. Although INRs using sinusoidal activation functions have been studied in terms of Fourier theory, recent works have shown the advantage of using wavelets instead of sinusoids as activation functions, due to their ability to simultaneously localize in both frequency and space. In this work, we approach such INRs and demonstrate how they resolve high-frequency features of signals from coarse approximations performed in the first layer of the MLP. This leads to multiple prescriptions for the design of INR architectures, including the use of complex wavelets, the decoupling of low-pass and band-pass approximations, and initialization schemes based on the singularities of the desired signal.
Submitted 30 September, 2023; originally announced October 2023.
Comments: 10 pages, 6 figures. 2 appendix pages, 1 appendix figure
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.00545v1-abstract-full').style.display = 'none'; document.getElementById('2310.00545v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 6 figures. 2 appendix pages, 1 appendix figure</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2309.12161">arXiv:2309.12161</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2309.12161">pdf</a>, <a href="https://arxiv.org/format/2309.12161">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Code Soliloquies for Accurate Calculations in Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Sonkar%2C+S">Shashank Sonkar</a>, <a href="/search/cs?searchtype=author&amp;query=Le%2C+M">MyCo Le</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xinghe Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+N">Naiming Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Mallick%2C+D+B">Debshila Basu Mallick</a>, <a href="/search/cs?searchtype=author&amp;query=Baraniuk%2C+R+G">Richard G. Baraniuk</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2309.12161v2-abstract-short" style="display: inline;"> High-quality conversational datasets are crucial for the successful development of Intelligent Tutoring Systems (ITS) that utilize a Large Language Model (LLM) backend. Synthetic student-teacher dialogues, generated using advanced GPT-4 models, are a common strategy for creating these datasets. However, subjects like physics that entail complex calculations pose a challenge. While GPT-4 presents i&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.12161v2-abstract-full').style.display = 'inline'; document.getElementById('2309.12161v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2309.12161v2-abstract-full" style="display: none;"> High-quality conversational datasets are crucial for the successful development of Intelligent Tutoring Systems (ITS) that utilize a Large Language Model (LLM) backend. Synthetic student-teacher dialogues, generated using advanced GPT-4 models, are a common strategy for creating these datasets. However, subjects like physics that entail complex calculations pose a challenge. While GPT-4 presents impressive language processing capabilities, its limitations in fundamental mathematical reasoning curtail its efficacy for such subjects. To tackle this limitation, we introduce in this paper an innovative stateful prompt design. Our design orchestrates a mock conversation where both student and tutorbot roles are simulated by GPT-4. 
Each student response triggers an internal monologue, or `code soliloquy&#39; in the GPT-tutorbot, which assesses whether its subsequent response would necessitate calculations. If a calculation is deemed necessary, it scripts the relevant Python code and uses the Python output to construct a response to the student. Our approach notably enhances the quality of synthetic conversation datasets, especially for subjects that are calculation-intensive. Our preliminary Subject Matter Expert evaluations reveal that our Higgs model, a fine-tuned LLaMA model, effectively uses Python for computations, which significantly enhances the accuracy and computational reliability of Higgs&#39; responses. Code, models, and datasets is available at https://github.com/luffycodes/Tutorbot-Spock-Phys. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.12161v2-abstract-full').style.display = 'none'; document.getElementById('2309.12161v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2307.01850">arXiv:2307.01850</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2307.01850">pdf</a>, <a href="https://arxiv.org/format/2307.01850">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Self-Consuming Generative Models Go MAD </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Alemohammad%2C+S">Sina Alemohammad</a>, <a href="/search/cs?searchtype=author&amp;query=Casco-Rodriguez%2C+J">Josue Casco-Rodriguez</a>, <a href="/search/cs?searchtype=author&amp;query=Luzi%2C+L">Lorenzo Luzi</a>, <a href="/search/cs?searchtype=author&amp;query=Humayun%2C+A+I">Ahmed Imtiaz Humayun</a>, <a href="/search/cs?searchtype=author&amp;query=Babaei%2C+H">Hossein Babaei</a>, <a href="/search/cs?searchtype=author&amp;query=LeJeune%2C+D">Daniel LeJeune</a>, <a href="/search/cs?searchtype=author&amp;query=Siahkoohi%2C+A">Ali Siahkoohi</a>, <a href="/search/cs?searchtype=author&amp;query=Baraniuk%2C+R+G">Richard G. Baraniuk</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2307.01850v1-abstract-short" style="display: inline;"> Seismic advances in generative AI algorithms for imagery, text, and other data types has led to the temptation to use synthetic data to train next-generation models. Repeating this process creates an autophagous (self-consuming) loop whose properties are poorly understood. 
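A schematic of the soliloquy loop as the abstract describes it; `llm` and `run_python` are hypothetical stand-ins, not the released code:

```python
# Schematic of a "code soliloquy"; `llm` and `run_python` are hypothetical
# callables (LLM query and sandboxed Python execution), not the paper's API.
def tutorbot_turn(llm, run_python, history, student_msg):
    history = history + [("student", student_msg)]
    # 1) Internal monologue: does the next reply need a calculation?
    decision = llm(history + [("system", "Does answering require calculation? yes/no")])
    if decision.strip().lower().startswith("yes"):
        # 2) Script Python for the calculation and execute it ...
        code = llm(history + [("system", "Write Python that computes the needed quantity.")])
        result = run_python(code)
        # 3) ... then condition the visible reply on the numeric output.
        reply = llm(history + [("system", f"Python output: {result}. Reply to the student.")])
    else:
        reply = llm(history + [("system", "Reply to the student.")])
    return history + [("tutor", reply)], reply
```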
arXiv:2307.01850 (https://arxiv.org/abs/2307.01850) [cs.LG, cs.AI, cs.CV]
Self-Consuming Generative Models Go MAD
Authors: Sina Alemohammad, Josue Casco-Rodriguez, Lorenzo Luzi, Ahmed Imtiaz Humayun, Hossein Babaei, Daniel LeJeune, Ali Siahkoohi, Richard G. Baraniuk
Abstract: Seismic advances in generative AI algorithms for imagery, text, and other data types have led to the temptation to use synthetic data to train next-generation models. Repeating this process creates an autophagous (self-consuming) loop whose properties are poorly understood. We conduct a thorough analytical and empirical study, using state-of-the-art generative image models, of three families of autophagous loops that differ in how much fixed or fresh real training data is available through the generations of training, and in whether the samples from previous-generation models have been biased to trade off data quality versus diversity. Our primary conclusion across all scenarios is that without enough fresh real data in each generation of an autophagous loop, future generative models are doomed to have their quality (precision) or diversity (recall) progressively decrease. We term this condition Model Autophagy Disorder (MAD), by analogy to mad cow disease.
Submitted 4 July, 2023; originally announced July 2023.
Comments: 31 pages, 31 figures, pre-print
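A toy numeric illustration of an autophagous loop with no fresh data (this is a one-dimensional caricature, not the paper's image experiments): refitting a Gaussian to its own finite samples makes the spread drift, and with biased sampling it collapses faster.

```python
import numpy as np

# Toy autophagous loop: each generation trains only on synthetic samples
# from the previous generation's model.
rng = np.random.default_rng(0)
mu, sigma = 0.0, 1.0
for gen in range(20):
    samples = rng.normal(mu, sigma, size=500)   # "synthetic data" from current model
    mu, sigma = samples.mean(), samples.std()   # "retrain" on synthetic data only
    print(f"gen {gen:2d}: sigma = {sigma:.3f}") # diversity (spread) drifts over generations
```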
arXiv:2305.14507 (https://arxiv.org/abs/2305.14507) [cs.CL]
Deduction under Perturbed Evidence: Probing Student Simulation Capabilities of Large Language Models
Authors: Shashank Sonkar, Richard G. Baraniuk
Abstract: We explore whether Large Language Models (LLMs) are capable of logical reasoning with distorted facts, which we call Deduction under Perturbed Evidence (DUPE). DUPE presents a unique challenge to LLMs since they typically rely on their parameters, which encode mostly accurate information, to reason and make inferences. However, in DUPE, LLMs must reason over manipulated or falsified evidence present in their prompts, which can result in false conclusions that are valid only under the manipulated evidence. Our goal with DUPE is to determine whether LLMs can arrive at these false conclusions and identify whether the dominant factor influencing the deduction process is the encoded data in the parameters or the manipulated evidence in the prompts. To evaluate the DUPE capabilities of LLMs, we create a DUPEd version of the StrategyQA dataset, where facts are manipulated to reverse the answer to the question. Our findings show that even the most advanced GPT models struggle to reason on manipulated facts, showcasing poor DUPE skills, with accuracy dropping by 45% compared to the original dataset. We also investigate prompt settings inspired by student simulation models, which mitigate the accuracy drop to some extent. Our findings have practical implications for understanding the performance of LLMs in real-world applications such as student simulation models that involve reasoning over inaccurate information.
Submitted 23 May, 2023; originally announced May 2023.
arXiv:2305.13297 (https://arxiv.org/abs/2305.13297) [cs.CL]
Investigating the Role of Feed-Forward Networks in Transformers Using Parallel Attention and Feed-Forward Net Design
Authors: Shashank Sonkar, Richard G. Baraniuk
Abstract: This paper investigates the key role of Feed-Forward Networks (FFNs) in transformer models by utilizing the Parallel Attention and Feed-Forward Net Design (PAF) architecture and comparing it to its Series Attention and Feed-Forward Net Design (SAF) counterpart. Central to the effectiveness of PAF are two main assumptions regarding the FFN block and the attention block within a layer: 1) the primary function of the FFN block is to maintain isotropy among token embeddings and prevent their degeneration, and 2) the residual norm computed in the attention block is substantially smaller than the input token embedding norm. To empirically validate these assumptions, we train PAF variants of two large language models (RoBERTa-large and bert-large-uncased). Our results demonstrate that both assumptions hold true in the PAF design. This study contributes to a deeper understanding of the roles and interactions between FFNs and self-attention mechanisms in transformer architectures.
Submitted 25 May, 2023; v1 submitted 22 May, 2023; originally announced May 2023.
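A sketch of the parallel layout the title refers to, contrasted in comments with the series layout; the sizes, norm placement, and single pre-norm are illustrative assumptions, not a specific model's configuration:

```python
import torch
import torch.nn as nn

class PAFBlock(nn.Module):
    """Parallel Attention and Feed-Forward (PAF) layer sketch.

    Series design (SAF): x -> x + attn(norm(x)); then y -> y + ffn(norm(y)).
    Parallel design (PAF): both branches read the same normalized input.
    """
    def __init__(self, d=64, heads=4):
        super().__init__()
        self.norm = nn.LayerNorm(d)
        self.attn = nn.MultiheadAttention(d, heads, batch_first=True)
        self.ffn = nn.Sequential(nn.Linear(d, 4 * d), nn.GELU(), nn.Linear(4 * d, d))

    def forward(self, x):
        h = self.norm(x)
        a, _ = self.attn(h, h, h, need_weights=False)
        return x + a + self.ffn(h)   # attention and FFN applied in parallel
```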
arXiv:2305.13272 (https://arxiv.org/abs/2305.13272) [cs.CL]
CLASS: A Design Framework for building Intelligent Tutoring Systems based on Learning Science principles
Authors: Shashank Sonkar, Naiming Liu, Debshila Basu Mallick, Richard G. Baraniuk
Abstract: We present a design framework called Conversational Learning with Analytical Step-by-Step Strategies (CLASS) for building advanced Intelligent Tutoring Systems (ITS) powered by high-performance Large Language Models (LLMs). The CLASS framework empowers ITS with two key capabilities. First, through a carefully curated scaffolding dataset, CLASS equips ITS with essential problem-solving strategies, enabling it to provide tutor-like, step-by-step guidance to students. Second, by using a dynamic conversational dataset, CLASS assists ITS in facilitating natural language interactions, fostering engaging student-tutor conversations. The CLASS framework also provides valuable insights into the ITS' internal decision-making process, which allows seamless integration of user feedback, thus enabling continuous refinement and improvement. We also present a proof-of-concept ITS, referred to as SPOCK, which is trained using the CLASS framework with a focus on introductory college-level biology content. A carefully constructed protocol was developed for SPOCK's preliminary evaluation, examining aspects such as the factual accuracy and relevance of its responses. Experts in the field of biology offered favorable remarks, particularly highlighting SPOCK's capability to break down questions into manageable subproblems and provide encouraging responses to students. Code and models are available at https://github.com/luffycodes/Tutorbot-Spock.
Submitted 25 October, 2023; v1 submitted 22 May, 2023; originally announced May 2023.
Comments: Paper accepted at EMNLP 2023
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Paper accepted at EMNLP 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2301.05187">arXiv:2301.05187</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2301.05187">pdf</a>, <a href="https://arxiv.org/format/2301.05187">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> WIRE: Wavelet Implicit Neural Representations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Saragadam%2C+V">Vishwanath Saragadam</a>, <a href="/search/cs?searchtype=author&amp;query=LeJeune%2C+D">Daniel LeJeune</a>, <a href="/search/cs?searchtype=author&amp;query=Tan%2C+J">Jasper Tan</a>, <a href="/search/cs?searchtype=author&amp;query=Balakrishnan%2C+G">Guha Balakrishnan</a>, <a href="/search/cs?searchtype=author&amp;query=Veeraraghavan%2C+A">Ashok Veeraraghavan</a>, <a href="/search/cs?searchtype=author&amp;query=Baraniuk%2C+R+G">Richard G. Baraniuk</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2301.05187v1-abstract-short" style="display: inline;"> Implicit neural representations (INRs) have recently advanced numerous vision-related areas. INR performance depends strongly on the choice of the nonlinear activation function employed in its multilayer perceptron (MLP) network. A wide range of nonlinearities have been explored, but, unfortunately, current INRs designed to have high accuracy also suffer from poor robustness (to signal noise, para&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2301.05187v1-abstract-full').style.display = 'inline'; document.getElementById('2301.05187v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2301.05187v1-abstract-full" style="display: none;"> Implicit neural representations (INRs) have recently advanced numerous vision-related areas. INR performance depends strongly on the choice of the nonlinear activation function employed in its multilayer perceptron (MLP) network. A wide range of nonlinearities have been explored, but, unfortunately, current INRs designed to have high accuracy also suffer from poor robustness (to signal noise, parameter variation, etc.). Inspired by harmonic analysis, we develop a new, highly accurate and robust INR that does not exhibit this tradeoff. Wavelet Implicit neural REpresentation (WIRE) uses a continuous complex Gabor wavelet activation function that is well-known to be optimally concentrated in space-frequency and to have excellent biases for representing images. 
A wide range of experiments (image denoising, image inpainting, super-resolution, computed tomography reconstruction, image overfitting, and novel view synthesis with neural radiance fields) demonstrate that WIRE defines the new state of the art in INR accuracy, training time, and robustness. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2301.05187v1-abstract-full').style.display = 'none'; document.getElementById('2301.05187v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 January, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2212.09723">arXiv:2212.09723</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2212.09723">pdf</a>, <a href="https://arxiv.org/format/2212.09723">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> MANER: Mask Augmented Named Entity Recognition for Extreme Low-Resource Languages </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Sonkar%2C+S">Shashank Sonkar</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zichao Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Baraniuk%2C+R+G">Richard G. Baraniuk</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2212.09723v1-abstract-short" style="display: inline;"> This paper investigates the problem of Named Entity Recognition (NER) for extreme low-resource languages with only a few hundred tagged data samples. NER is a fundamental task in Natural Language Processing (NLP). A critical driver accelerating NER systems&#39; progress is the existence of large-scale language corpora that enable NER systems to achieve outstanding performance in languages such as Engl&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.09723v1-abstract-full').style.display = 'inline'; document.getElementById('2212.09723v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2212.09723v1-abstract-full" style="display: none;"> This paper investigates the problem of Named Entity Recognition (NER) for extreme low-resource languages with only a few hundred tagged data samples. NER is a fundamental task in Natural Language Processing (NLP). A critical driver accelerating NER systems&#39; progress is the existence of large-scale language corpora that enable NER systems to achieve outstanding performance in languages such as English and French with abundant training data. However, NER for low-resource languages remains relatively unexplored. In this paper, we introduce Mask Augmented Named Entity Recognition (MANER), a new methodology that leverages the distributional hypothesis of pre-trained masked language models (MLMs) for NER. The &lt;mask&gt; token in pre-trained MLMs encodes valuable semantic contextual information. MANER re-purposes the &lt;mask&gt; token for NER prediction. 
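The continuous complex Gabor wavelet the abstract names is a complex sinusoid windowed by a Gaussian; a sketch of it as an activation follows, with the frequency and spread parameters as illustrative defaults rather than the paper's tuned values:

```python
import torch

def gabor_wavelet(x, omega0=10.0, s0=10.0):
    """Complex Gabor wavelet activation for a WIRE-style INR layer.

    psi(x) = exp(i * omega0 * x) * exp(-(s0 * x)^2): a complex sinusoid
    windowed by a Gaussian, localized in both space and frequency.
    """
    return torch.exp(1j * omega0 * x - (s0 * x) ** 2)

# In an MLP layer: y = gabor_wavelet(linear(x)), with the real part
# typically taken at the network output.
```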
arXiv:2212.09723 (https://arxiv.org/abs/2212.09723) [cs.CL]
MANER: Mask Augmented Named Entity Recognition for Extreme Low-Resource Languages
Authors: Shashank Sonkar, Zichao Wang, Richard G. Baraniuk
Abstract: This paper investigates the problem of Named Entity Recognition (NER) for extreme low-resource languages with only a few hundred tagged data samples. NER is a fundamental task in Natural Language Processing (NLP). A critical driver accelerating NER systems' progress is the existence of large-scale language corpora that enable NER systems to achieve outstanding performance in languages such as English and French with abundant training data. However, NER for low-resource languages remains relatively unexplored. In this paper, we introduce Mask Augmented Named Entity Recognition (MANER), a new methodology that leverages the distributional hypothesis of pre-trained masked language models (MLMs) for NER. The <mask> token in pre-trained MLMs encodes valuable semantic contextual information. MANER re-purposes the <mask> token for NER prediction. Specifically, we prepend the <mask> token to every word in a sentence for which we would like to predict the named entity tag. During training, we jointly fine-tune the MLM and a new NER prediction head attached to each <mask> token. We demonstrate that MANER is well-suited for NER in low-resource languages; our experiments show that for 100 languages with as few as 100 training examples, it improves on state-of-the-art methods by up to 48% and by 12% on average in F1 score. We also perform detailed analyses and ablation studies to understand the scenarios best suited to MANER.
Submitted 19 December, 2022; originally announced December 2022.
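The input construction the abstract describes is easy to sketch; the tokenizer choice and example sentence are illustrative, and the model/head wiring is omitted:

```python
from transformers import AutoTokenizer

# MANER-style input: prepend <mask> before every word whose entity tag
# we want to predict (an NER head would then read each <mask> position).
tok = AutoTokenizer.from_pretrained("xlm-roberta-base")
words = ["Rice", "University", "is", "in", "Houston"]
augmented = " ".join(f"{tok.mask_token} {w}" for w in words)
print(augmented)   # "<mask> Rice <mask> University ... <mask> Houston"
enc = tok(augmented, return_tensors="pt")
# During training, the MLM and a new NER prediction head attached to each
# <mask> hidden state are fine-tuned jointly to tag the following word.
```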
arXiv:2212.06345 (https://arxiv.org/abs/2212.06345) [physics.optics, cs.CV]
Foveated Thermal Computational Imaging in the Wild Using All-Silicon Meta-Optics
Authors: Vishwanath Saragadam, Zheyi Han, Vivek Boominathan, Luocheng Huang, Shiyu Tan, Johannes E. Fröch, Karl F. Böhringer, Richard G. Baraniuk, Arka Majumdar, Ashok Veeraraghavan
Abstract: Foveated imaging provides a better tradeoff between situational awareness (field of view) and resolution, and is critical in long-wavelength infrared regimes because of the size, weight, power, and cost of thermal sensors. We demonstrate computational foveated imaging by exploiting the ability of a meta-optical frontend to discriminate between different polarization states and a computational backend to reconstruct the captured image/video. The frontend is a three-element optic: the first element, which we call the "foveal" element, is a metalens that focuses s-polarized light at a distance of $f_1$ without affecting the p-polarized light; the second element, which we call the "perifoveal" element, is another metalens that focuses p-polarized light at a distance of $f_2$ without affecting the s-polarized light. The third element is a freely rotating polarizer that dynamically changes the mixing ratios between the two polarization states. Both the foveal element (focal length = 150 mm; diameter = 75 mm) and the perifoveal element (focal length = 25 mm; diameter = 25 mm) were fabricated as polarization-sensitive, all-silicon metasurfaces, resulting in a large-aperture, 1:6 foveal-expansion thermal imaging capability. A computational backend then utilizes a deep image prior to separate the resultant multiplexed image or video into a foveated image consisting of a high-resolution center and a lower-resolution large-field-of-view context. We build a first-of-its-kind prototype system and demonstrate 12 frames per second real-time, thermal, foveated image and video capture in the wild.
Submitted 12 December, 2022; originally announced December 2022.
arXiv:2211.11074 (https://arxiv.org/abs/2211.11074) [cs.LG]
Frozen Overparameterization: A Double Descent Perspective on Transfer Learning of Deep Neural Networks
Authors: Yehuda Dar, Lorenzo Luzi, Richard G. Baraniuk
Abstract: We study the generalization behavior of transfer learning of deep neural networks (DNNs). We adopt the overparameterization perspective -- featuring interpolation of the training data (i.e., approximately zero train error) and the double descent phenomenon -- to explain the delicate effect of the transfer learning setting on generalization performance. We study how the generalization behavior of transfer learning is affected by the dataset size in the source and target tasks, the number of transferred layers that are kept frozen in the target DNN training, and the similarity between the source and target tasks. We show that the test error evolution during the target DNN training has a more significant double descent effect when the target training dataset is sufficiently large. In addition, a larger source training dataset can yield slower target DNN training. Moreover, we demonstrate that the number of frozen layers can determine whether the transfer learning is effectively underparameterized or overparameterized and, in turn, this may induce a freezing-wise double descent phenomenon that determines the relative success or failure of learning. Also, we show that the double descent phenomenon may make a transfer from a less related source task better than a transfer from a more related source task. We establish our results using image classification experiments with the ResNet, DenseNet, and vision transformer (ViT) architectures.
Submitted 12 June, 2023; v1 submitted 20 November, 2022; originally announced November 2022.
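For readers unfamiliar with double descent, a toy random-features experiment reproduces the shape of the curve; this illustrates the phenomenon only and is unrelated to the paper's transfer-learning setup:

```python
import numpy as np

# Toy double descent: min-norm least squares on tanh random features.
rng = np.random.default_rng(0)
n, d = 100, 20
X = rng.normal(size=(n, d)); w = rng.normal(size=d)
y = X @ w + 0.5 * rng.normal(size=n)
Xt = rng.normal(size=(2000, d)); yt = Xt @ w

for p in [10, 50, 90, 100, 110, 200, 800]:       # number of random features
    W = rng.normal(size=(d, p)) / np.sqrt(d)
    F, Ft = np.tanh(X @ W), np.tanh(Xt @ W)
    beta, *_ = np.linalg.lstsq(F, y, rcond=None)  # min-norm solution when p > n
    err = np.mean((Ft @ beta - yt) ** 2)
    print(f"p={p:4d}  test MSE={err:8.3f}")       # error peaks near p = n, then redescends
```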
arXiv:2211.03751 (https://arxiv.org/abs/2211.03751) [math.NA, cs.DS, math.ST]
Asymptotics of the Sketched Pseudoinverse
Authors: Daniel LeJeune, Pratik Patil, Hamid Javadi, Richard G. Baraniuk, Ryan J. Tibshirani
Abstract: We take a random matrix theory approach to random sketching and show an asymptotic first-order equivalence of the regularized sketched pseudoinverse of a positive semidefinite matrix to a certain evaluation of the resolvent of the same matrix. We focus on real-valued regularization and extend previous results on an asymptotic equivalence of random matrices to the real setting, providing a precise characterization of the equivalence even under negative regularization, including a precise characterization of the smallest nonzero eigenvalue of the sketched matrix, which may be of independent interest. We then further characterize the second-order equivalence of the sketched pseudoinverse. We also apply our results to the analysis of the sketch-and-project method and to sketched ridge regression. Lastly, we prove that these results generalize to asymptotically free sketching matrices, obtaining the resulting equivalence for orthogonal sketching matrices and comparing our results to several common sketches used in practice.
Submitted 6 October, 2023; v1 submitted 7 November, 2022; originally announced November 2022.
Comments: 45 pages, 9 figures
MSC Class: 15B52; 46L54; 62J07
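Schematically, the first-order equivalence in the abstract can be written as below, where $S$ is a sketching matrix, $A \succeq 0$, and $\mu(\lambda)$ is an effective regularization level determined by a fixed-point condition; this is a paraphrase of the abstract's claim, and the precise statement and assumptions are in the paper:

```latex
% Schematic form of the first-order resolvent equivalence:
S \left( S^\top A S + \lambda I \right)^{-1} S^\top
\;\simeq\;
\left( A + \mu(\lambda)\, I \right)^{-1}
```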
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.03751v4-abstract-full').style.display = 'none'; document.getElementById('2211.03751v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 November, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">45 pages, 9 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 15B52; 46L54; 62J07 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2210.12565">arXiv:2210.12565</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2210.12565">pdf</a>, <a href="https://arxiv.org/format/2210.12565">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> A Visual Tour Of Current Challenges In Multimodal Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Sonkar%2C+S">Shashank Sonkar</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+N">Naiming Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Baraniuk%2C+R+G">Richard G. Baraniuk</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2210.12565v1-abstract-short" style="display: inline;"> Transformer models trained on massive text corpora have become the de facto models for a wide range of natural language processing tasks. However, learning effective word representations for function words remains challenging. Multimodal learning, which visually grounds transformer models in imagery, can overcome the challenges to some extent; however, there is still much work to be done. In this&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.12565v1-abstract-full').style.display = 'inline'; document.getElementById('2210.12565v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2210.12565v1-abstract-full" style="display: none;"> Transformer models trained on massive text corpora have become the de facto models for a wide range of natural language processing tasks. However, learning effective word representations for function words remains challenging. Multimodal learning, which visually grounds transformer models in imagery, can overcome the challenges to some extent; however, there is still much work to be done. In this study, we explore the extent to which visual grounding facilitates the acquisition of function words using stable diffusion models that employ multimodal models for text-to-image generation. 
Out of seven categories of function words, along with numerous subcategories, we find that stable diffusion models effectively model only a small fraction of function words -- a few pronoun subcategories and relatives. We hope that our findings will stimulate the development of new datasets and approaches that enable multimodal models to learn better representations of function words. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.12565v1-abstract-full').style.display = 'none'; document.getElementById('2210.12565v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2210.12100">arXiv:2210.12100</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2210.12100">pdf</a>, <a href="https://arxiv.org/format/2210.12100">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Boomerang: Local sampling on image manifolds using diffusion models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Luzi%2C+L">Lorenzo Luzi</a>, <a href="/search/cs?searchtype=author&amp;query=Mayer%2C+P+M">Paul M Mayer</a>, <a href="/search/cs?searchtype=author&amp;query=Casco-Rodriguez%2C+J">Josue Casco-Rodriguez</a>, <a href="/search/cs?searchtype=author&amp;query=Siahkoohi%2C+A">Ali Siahkoohi</a>, <a href="/search/cs?searchtype=author&amp;query=Baraniuk%2C+R+G">Richard G. Baraniuk</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2210.12100v2-abstract-short" style="display: inline;"> The inference stage of diffusion models can be seen as running a reverse-time diffusion stochastic differential equation, where samples from a Gaussian latent distribution are transformed into samples from a target distribution that usually reside on a low-dimensional manifold, e.g., an image manifold. The intermediate values between the initial latent space and the image manifold can be interpret&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.12100v2-abstract-full').style.display = 'inline'; document.getElementById('2210.12100v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2210.12100v2-abstract-full" style="display: none;"> The inference stage of diffusion models can be seen as running a reverse-time diffusion stochastic differential equation, where samples from a Gaussian latent distribution are transformed into samples from a target distribution that usually reside on a low-dimensional manifold, e.g., an image manifold. 
The intermediate values between the initial latent space and the image manifold can be interpreted as noisy images, with the amount of noise determined by the forward diffusion process noise schedule. We utilize this interpretation to present Boomerang, an approach for local sampling of image manifolds. As implied by its name, Boomerang local sampling involves adding noise to an input image, moving it closer to the latent space, and then mapping it back to the image manifold through a partial reverse diffusion process. Thus, Boomerang generates images on the manifold that are ``similar,&#39;&#39; but nonidentical, to the original input image. We can control the proximity of the generated images to the original by adjusting the amount of noise added. Furthermore, due to the stochastic nature of the reverse diffusion process in Boomerang, the generated images display a certain degree of stochasticity, allowing us to obtain local samples from the manifold without encountering any duplicates. Boomerang offers the flexibility to work seamlessly with any pretrained diffusion model, such as Stable Diffusion, without necessitating any adjustments to the reverse diffusion process. We present three applications for Boomerang. First, we provide a framework for constructing privacy-preserving datasets having controllable degrees of anonymity. Second, we show that using Boomerang for data augmentation increases generalization performance and outperforms state-of-the-art synthetic data augmentation. Lastly, we introduce a perceptual image enhancement framework, which enables resolution enhancement. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.12100v2-abstract-full').style.display = 'none'; document.getElementById('2210.12100v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2022. 
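<p class="is-size-7">The local sampling loop described in this abstract can be sketched compactly. Below is a minimal sketch assuming a DDPM-style cumulative noise schedule <code>alphas_bar</code> and a pretrained noise-prediction network <code>eps_model</code> (hypothetical stand-ins for, e.g., a Stable Diffusion backbone); for brevity it uses a deterministic DDIM-like reverse update rather than the stochastic reverse process the method relies on for sample diversity.</p> <pre><code class="language-python">
import torch

def boomerang(x0, eps_model, alphas_bar, t_boom):
    """Locally sample around x0: partially diffuse, then map back."""
    a_bar = alphas_bar[t_boom]
    noise = torch.randn_like(x0)
    # forward diffusion: move the image part-way toward the latent space
    x = torch.sqrt(a_bar) * x0 + torch.sqrt(1 - a_bar) * noise
    # partial reverse diffusion: map back onto the image manifold
    for t in range(t_boom, 0, -1):
        a_bar_t, a_bar_prev = alphas_bar[t], alphas_bar[t - 1]
        eps = eps_model(x, t)  # predicted noise at step t
        x0_hat = (x - torch.sqrt(1 - a_bar_t) * eps) / torch.sqrt(a_bar_t)
        x = torch.sqrt(a_bar_prev) * x0_hat + torch.sqrt(1 - a_bar_prev) * eps
    return x  # close to, but not identical to, x0; larger t_boom = less similar
</code></pre>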
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published in Transactions on Machine Learning Research</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2209.14778">arXiv:2209.14778</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2209.14778">pdf</a>, <a href="https://arxiv.org/format/2209.14778">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computational Geometry">cs.CG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Batch Normalization Explained </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Balestriero%2C+R">Randall Balestriero</a>, <a href="/search/cs?searchtype=author&amp;query=Baraniuk%2C+R+G">Richard G. Baraniuk</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2209.14778v1-abstract-short" style="display: inline;"> A critically important, ubiquitous, and yet poorly understood ingredient in modern deep networks (DNs) is batch normalization (BN), which centers and normalizes the feature maps. To date, only limited progress has been made understanding why BN boosts DN learning and inference performance; work has focused exclusively on showing that BN smooths a DN&#39;s loss landscape. In this paper, we study BN the&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2209.14778v1-abstract-full').style.display = 'inline'; document.getElementById('2209.14778v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2209.14778v1-abstract-full" style="display: none;"> A critically important, ubiquitous, and yet poorly understood ingredient in modern deep networks (DNs) is batch normalization (BN), which centers and normalizes the feature maps. To date, only limited progress has been made understanding why BN boosts DN learning and inference performance; work has focused exclusively on showing that BN smooths a DN&#39;s loss landscape. In this paper, we study BN theoretically from the perspective of function approximation; we exploit the fact that most of today&#39;s state-of-the-art DNs are continuous piecewise affine (CPA) splines that fit a predictor to the training data via affine mappings defined over a partition of the input space (the so-called &#34;linear regions&#34;). {\em We demonstrate that BN is an unsupervised learning technique that -- independent of the DN&#39;s weights or gradient-based learning -- adapts the geometry of a DN&#39;s spline partition to match the data.} BN provides a &#34;smart initialization&#34; that boosts the performance of DN learning, because it adapts even a DN initialized with random weights to align its spline partition with the data. 
We also show that the variation of BN statistics between mini-batches introduces a dropout-like random perturbation to the partition boundaries and hence the decision boundary for classification problems. This per mini-batch perturbation reduces overfitting and improves generalization by increasing the margin between the training samples and the decision boundary. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2209.14778v1-abstract-full').style.display = 'none'; document.getElementById('2209.14778v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 September, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2208.00579">arXiv:2208.00579</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2208.00579">pdf</a>, <a href="https://arxiv.org/format/2208.00579">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Numerical Analysis">math.NA</span> </div> </div> <p class="title is-5 mathjax"> Momentum Transformer: Closing the Performance Gap Between Self-attention and Its Linearization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Nguyen%2C+T">Tan Nguyen</a>, <a href="/search/cs?searchtype=author&amp;query=Baraniuk%2C+R+G">Richard G. Baraniuk</a>, <a href="/search/cs?searchtype=author&amp;query=Kirby%2C+R+M">Robert M. Kirby</a>, <a href="/search/cs?searchtype=author&amp;query=Osher%2C+S+J">Stanley J. Osher</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+B">Bao Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2208.00579v1-abstract-short" style="display: inline;"> Transformers have achieved remarkable success in sequence modeling and beyond but suffer from quadratic computational and memory complexities with respect to the length of the input sequence. Leveraging techniques including sparse and linear attention and hashing tricks, efficient transformers have been proposed to reduce the quadratic complexity of transformers, but they significantly degrade the accurac&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2208.00579v1-abstract-full').style.display = 'inline'; document.getElementById('2208.00579v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2208.00579v1-abstract-full" style="display: none;"> Transformers have achieved remarkable success in sequence modeling and beyond but suffer from quadratic computational and memory complexities with respect to the length of the input sequence. Leveraging techniques including sparse and linear attention and hashing tricks, efficient transformers have been proposed to reduce the quadratic complexity of transformers, but they significantly degrade the accuracy. In response, we first interpret the linear attention and residual connections in computing the attention map as gradient descent steps. 
We then introduce momentum into these components and propose the \emph{momentum transformer}, which utilizes momentum to improve the accuracy of linear transformers while maintaining linear memory and computational complexities. Furthermore, we develop an adaptive strategy to compute the momentum value for our model based on the optimal momentum for quadratic optimization. This adaptive momentum eliminates the need to search for the optimal momentum value and further enhances the performance of the momentum transformer. A range of experiments on both autoregressive and non-autoregressive tasks, including image generation and machine translation, demonstrate that the momentum transformer outperforms popular linear transformers in training efficiency and accuracy. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2208.00579v1-abstract-full').style.display = 'none'; document.getElementById('2208.00579v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 July, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">22 pages, 5 figures. arXiv admin note: substantial text overlap with arXiv:2110.07034</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 65Pxx </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2205.14055">arXiv:2205.14055</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2205.14055">pdf</a>, <a href="https://arxiv.org/format/2205.14055">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> A Blessing of Dimensionality in Membership Inference through Regularization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Tan%2C+J">Jasper Tan</a>, <a href="/search/cs?searchtype=author&amp;query=LeJeune%2C+D">Daniel LeJeune</a>, <a href="/search/cs?searchtype=author&amp;query=Mason%2C+B">Blake Mason</a>, <a href="/search/cs?searchtype=author&amp;query=Javadi%2C+H">Hamid Javadi</a>, <a href="/search/cs?searchtype=author&amp;query=Baraniuk%2C+R+G">Richard G. Baraniuk</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2205.14055v2-abstract-short" style="display: inline;"> Is overparameterization a privacy liability? In this work, we study the effect that the number of parameters has on a classifier&#39;s vulnerability to membership inference attacks. We first demonstrate how the number of parameters of a model can induce a privacy--utility trade-off: increasing the number of parameters generally improves generalization performance at the expense of lower privacy. 
Howev&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2205.14055v2-abstract-full').style.display = 'inline'; document.getElementById('2205.14055v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2205.14055v2-abstract-full" style="display: none;"> Is overparameterization a privacy liability? In this work, we study the effect that the number of parameters has on a classifier&#39;s vulnerability to membership inference attacks. We first demonstrate how the number of parameters of a model can induce a privacy--utility trade-off: increasing the number of parameters generally improves generalization performance at the expense of lower privacy. However, remarkably, we then show that if coupled with proper regularization, increasing the number of parameters of a model can actually simultaneously increase both its privacy and performance, thereby eliminating the privacy--utility trade-off. Theoretically, we demonstrate this curious phenomenon for logistic regression with ridge regularization in a bi-level feature ensemble setting. Pursuant to our theoretical exploration, we develop a novel leave-one-out analysis tool to precisely characterize the vulnerability of a linear classifier to the optimal membership inference attack. We empirically exhibit this &#34;blessing of dimensionality&#34; for neural networks on a variety of tasks using early stopping as the regularizer. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2205.14055v2-abstract-full').style.display = 'none'; document.getElementById('2205.14055v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 April, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 May, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">26 pages, 14 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2204.03145">arXiv:2204.03145</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2204.03145">pdf</a>, <a href="https://arxiv.org/format/2204.03145">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Applications">stat.AP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> DeepTensor: Low-Rank Tensor Decomposition with Deep Network Priors </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Saragadam%2C+V">Vishwanath Saragadam</a>, <a href="/search/cs?searchtype=author&amp;query=Balestriero%2C+R">Randall Balestriero</a>, <a href="/search/cs?searchtype=author&amp;query=Veeraraghavan%2C+A">Ashok Veeraraghavan</a>, <a href="/search/cs?searchtype=author&amp;query=Baraniuk%2C+R+G">Richard G. 
Baraniuk</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2204.03145v1-abstract-short" style="display: inline;"> DeepTensor is a computationally efficient framework for low-rank decomposition of matrices and tensors using deep generative networks. We decompose a tensor as the product of low-rank tensor factors (e.g., a matrix as the outer product of two vectors), where each low-rank tensor is generated by a deep network (DN) that is trained in a self-supervised manner to minimize the mean-squared approximati&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2204.03145v1-abstract-full').style.display = 'inline'; document.getElementById('2204.03145v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2204.03145v1-abstract-full" style="display: none;"> DeepTensor is a computationally efficient framework for low-rank decomposition of matrices and tensors using deep generative networks. We decompose a tensor as the product of low-rank tensor factors (e.g., a matrix as the outer product of two vectors), where each low-rank tensor is generated by a deep network (DN) that is trained in a self-supervised manner to minimize the mean-squared approximation error. Our key observation is that the implicit regularization inherent in DNs enables them to capture nonlinear signal structures (e.g., manifolds) that are out of the reach of classical linear methods like the singular value decomposition (SVD) and principal component analysis (PCA). Furthermore, in contrast to the SVD and PCA, whose performance deteriorates when the tensor&#39;s entries deviate from additive white Gaussian noise, we demonstrate that the performance of DeepTensor is robust to a wide range of distributions. We validate that DeepTensor is a robust and computationally efficient drop-in replacement for the SVD, PCA, nonnegative matrix factorization (NMF), and similar decompositions by exploring a range of real-world applications, including hyperspectral image denoising, 3D MRI tomography, and image classification. In particular, DeepTensor offers a 6dB signal-to-noise ratio improvement over standard denoising methods for signals corrupted by Poisson noise and learns to decompose 3D tensors 60 times faster than a single DN equipped with 3D convolutions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2204.03145v1-abstract-full').style.display = 'none'; document.getElementById('2204.03145v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 April, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2022. 
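<p class="is-size-7">A minimal sketch of the decomposition described in this abstract, for the matrix case and under assumptions: the network architectures and fixed random latent inputs below are illustrative, not the paper's exact design. Each low-rank factor is generated by its own small network, trained self-supervised to minimize the mean-squared approximation error.</p> <pre><code class="language-python">
import torch
import torch.nn as nn

n, m, r = 128, 96, 5
Y = torch.randn(n, m)  # stand-in for a noisy data matrix

def factor_net(rows, rank):
    # small generator network for one low-rank factor
    return nn.Sequential(nn.Linear(64, 128), nn.ReLU(),
                         nn.Linear(128, rows * rank))

net_u, net_v = factor_net(n, r), factor_net(m, r)
z_u, z_v = torch.randn(64), torch.randn(64)  # fixed latent inputs
opt = torch.optim.Adam(list(net_u.parameters()) + list(net_v.parameters()),
                       lr=1e-3)

for step in range(2000):
    U = net_u(z_u).reshape(n, r)        # generated left factor
    V = net_v(z_v).reshape(m, r)        # generated right factor
    loss = ((U @ V.T - Y) ** 2).mean()  # self-supervised MSE objective
    opt.zero_grad()
    loss.backward()
    opt.step()
</code></pre>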
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">14 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2203.03716">arXiv:2203.03716</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2203.03716">pdf</a>, <a href="https://arxiv.org/format/2203.03716">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> GPT-based Open-Ended Knowledge Tracing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+N">Naiming Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zichao Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Baraniuk%2C+R+G">Richard G. Baraniuk</a>, <a href="/search/cs?searchtype=author&amp;query=Lan%2C+A">Andrew Lan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2203.03716v4-abstract-short" style="display: inline;"> In education applications, knowledge tracing refers to the problem of estimating students&#39; time-varying concept/skill mastery level from their past responses to questions and predicting their future performance. One key limitation of most existing knowledge tracing methods is that they treat student responses to questions as binary-valued, i.e., whether they are correct or incorrect. Response corr&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.03716v4-abstract-full').style.display = 'inline'; document.getElementById('2203.03716v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2203.03716v4-abstract-full" style="display: none;"> In education applications, knowledge tracing refers to the problem of estimating students&#39; time-varying concept/skill mastery level from their past responses to questions and predicting their future performance. One key limitation of most existing knowledge tracing methods is that they treat student responses to questions as binary-valued, i.e., whether they are correct or incorrect. Response correctness analysis/prediction ignores important information on student knowledge contained in the exact content of the responses, especially for open-ended questions. In this paper, we conduct the first exploration into open-ended knowledge tracing (OKT) by studying the new task of predicting students&#39; exact open-ended responses to questions. Our work is grounded in the domain of computer science education with programming questions. We develop an initial solution to the OKT problem, a student knowledge-guided code generation approach, that combines program synthesis methods using language models with student knowledge tracing methods. We also conduct a series of quantitative and qualitative experiments on a real-world student code dataset to validate OKT and demonstrate its promise in educational applications. 
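<p class="is-size-7">One way to picture the student knowledge-guided code generation idea described in this abstract is as a soft prompt. The sketch below is written under assumed interfaces: <code>lm</code> is a hypothetical pretrained causal code language model accepting input embeddings, and the knowledge-tracing state is an ordinary vector; none of these names come from the paper.</p> <pre><code class="language-python">
import torch
import torch.nn as nn

class KnowledgeGuidedLM(nn.Module):
    """Condition a code-generating LM on a student's knowledge state."""

    def __init__(self, lm, d_kt, d_model):
        super().__init__()
        self.lm = lm                          # hypothetical causal code LM
        self.proj = nn.Linear(d_kt, d_model)  # knowledge state -> soft prompt

    def forward(self, knowledge_state, question_embeds):
        # knowledge_state: (B, d_kt) from any knowledge-tracing model;
        # question_embeds: (B, T, d_model) embedded question tokens
        prompt = self.proj(knowledge_state).unsqueeze(1)  # (B, 1, d_model)
        inputs = torch.cat([prompt, question_embeds], dim=1)
        # the LM then predicts the student's open-ended code response
        return self.lm(inputs)
</code></pre>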
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.03716v4-abstract-full').style.display = 'none'; document.getElementById('2203.03716v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 March, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 February, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper is accepted at EMNLP 2022. The code can be found at https://github.com/lucy66666/OKT</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2203.03099">arXiv:2203.03099</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2203.03099">pdf</a>, <a href="https://arxiv.org/format/2203.03099">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Spectral Theory">math.SP</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1007/s00365-022-09601-5">10.1007/s00365-022-09601-5 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Singular Value Perturbation and Deep Network Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Riedi%2C+R+H">Rudolf H. Riedi</a>, <a href="/search/cs?searchtype=author&amp;query=Balestriero%2C+R">Randall Balestriero</a>, <a href="/search/cs?searchtype=author&amp;query=Baraniuk%2C+R+G">Richard G. Baraniuk</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2203.03099v4-abstract-short" style="display: inline;"> We develop new theoretical results on matrix perturbation to shed light on the impact of architecture on the performance of a deep network. In particular, we explain analytically what deep learning practitioners have long observed empirically: the parameters of some deep architectures (e.g., residual networks, ResNets, and Dense networks, DenseNets) are easier to optimize than others (e.g., convol&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.03099v4-abstract-full').style.display = 'inline'; document.getElementById('2203.03099v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2203.03099v4-abstract-full" style="display: none;"> We develop new theoretical results on matrix perturbation to shed light on the impact of architecture on the performance of a deep network. In particular, we explain analytically what deep learning practitioners have long observed empirically: the parameters of some deep architectures (e.g., residual networks, ResNets, and Dense networks, DenseNets) are easier to optimize than others (e.g., convolutional networks, ConvNets). 
Building on our earlier work connecting deep networks with continuous piecewise-affine splines, we develop an exact local linear representation of a deep network layer for a family of modern deep networks that includes ConvNets at one end of a spectrum and ResNets, DenseNets, and other networks with skip connections at the other. For regression and classification tasks that optimize the squared-error loss, we show that the optimization loss surface of a modern deep network is piecewise quadratic in the parameters, with local shape governed by the singular values of a matrix that is a function of the local linear representation. We develop new perturbation results for how the singular values of matrices of this sort behave as we add a fraction of the identity and multiply by certain diagonal matrices. A direct application of our perturbation results explains analytically why a network with skip connections (such as a ResNet or DenseNet) is easier to optimize than a ConvNet: thanks to its more stable singular values and smaller condition number, the local loss surface of such a network is less erratic, less eccentric, and features local minima that are more accommodating to gradient-based optimization. Our results also shed new light on the impact of different nonlinear activation functions on a deep network&#39;s singular values, regardless of its architecture. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.03099v4-abstract-full').style.display = 'none'; document.getElementById('2203.03099v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 December, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 March, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Constr Approx (2022)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2202.11811">arXiv:2202.11811</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2202.11811">pdf</a>, <a href="https://arxiv.org/format/2202.11811">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> NeuroView-RNN: It&#39;s About Time </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Barberan%2C+C">CJ Barberan</a>, <a href="/search/cs?searchtype=author&amp;query=Alemohammad%2C+S">Sina Alemohammad</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+N">Naiming Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Balestriero%2C+R">Randall Balestriero</a>, <a href="/search/cs?searchtype=author&amp;query=Baraniuk%2C+R+G">Richard G. Baraniuk</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2202.11811v1-abstract-short" style="display: inline;"> Recurrent Neural Networks (RNNs) are important tools for processing sequential data such as time-series or video. 
Interpretability is defined as the ability to be understood by a person and is different from explainability, which is the ability to be explained in a mathematical formulation. A key interpretability issue with RNNs is that it is not clear how each hidden state per time step contribut&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2202.11811v1-abstract-full').style.display = 'inline'; document.getElementById('2202.11811v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2202.11811v1-abstract-full" style="display: none;"> Recurrent Neural Networks (RNNs) are important tools for processing sequential data such as time-series or video. Interpretability is defined as the ability to be understood by a person and is different from explainability, which is the ability to be explained in a mathematical formulation. A key interpretability issue with RNNs is that it is not clear how each hidden state per time step contributes to the decision-making process in a quantitative manner. We propose NeuroView-RNN as a family of new RNN architectures that explains how all the time steps are used for the decision-making process. Each member of the family is derived from a standard RNN architecture by concatenation of the hidden steps into a global linear classifier. The global linear classifier has all the hidden states as the input, so the weights of the classifier have a linear mapping to the hidden states. Hence, from the weights, NeuroView-RNN can quantify how important each time step is to a particular decision. As a bonus, NeuroView-RNN also offers higher accuracy in many cases compared to the RNNs and their variants. We showcase the benefits of NeuroView-RNN by evaluating on a multitude of diverse time-series datasets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2202.11811v1-abstract-full').style.display = 'none'; document.getElementById('2202.11811v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 February, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2022. 
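<p class="is-size-7">A minimal sketch of the construction described in this abstract, with illustrative sizes: the hidden states of all time steps are concatenated into one global linear classifier, so each time step's contribution to a decision can be read directly off the classifier weights.</p> <pre><code class="language-python">
import torch
import torch.nn as nn

class NeuroViewRNN(nn.Module):
    def __init__(self, d_in, d_hidden, seq_len, n_classes):
        super().__init__()
        self.rnn = nn.GRU(d_in, d_hidden, batch_first=True)
        self.classifier = nn.Linear(seq_len * d_hidden, n_classes)

    def forward(self, x):                # x: (B, T, d_in)
        h, _ = self.rnn(x)               # hidden state at every time step
        flat = h.reshape(h.size(0), -1)  # concatenate across time
        return self.classifier(flat)

# per-time-step importance for each class: norms of the classifier weights
model = NeuroViewRNN(8, 32, 50, 3)
W = model.classifier.weight.detach().reshape(3, 50, 32)  # (class, T, hidden)
importance = W.norm(dim=2)  # how much each time step drives each class
</code></pre>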
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">21 pages, 13 figures, 9 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2202.10649">arXiv:2202.10649</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2202.10649">pdf</a>, <a href="https://arxiv.org/ps/2202.10649">ps</a>, <a href="https://arxiv.org/format/2202.10649">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/TSP.2022.3223217">10.1109/TSP.2022.3223217 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> On Local Distributions in Graph Signal Processing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Roddenberry%2C+T+M">T. Mitchell Roddenberry</a>, <a href="/search/cs?searchtype=author&amp;query=Gama%2C+F">Fernando Gama</a>, <a href="/search/cs?searchtype=author&amp;query=Baraniuk%2C+R+G">Richard G. Baraniuk</a>, <a href="/search/cs?searchtype=author&amp;query=Segarra%2C+S">Santiago Segarra</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2202.10649v1-abstract-short" style="display: inline;"> Graph filtering is the cornerstone operation in graph signal processing (GSP). Thus, understanding it is key in developing potent GSP methods. Graph filters are local and distributed linear operations, whose output depends only on the local neighborhood of each node. Moreover, a graph filter&#39;s output can be computed separately at each node by carrying out repeated exchanges with immediate neighbor&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2202.10649v1-abstract-full').style.display = 'inline'; document.getElementById('2202.10649v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2202.10649v1-abstract-full" style="display: none;"> Graph filtering is the cornerstone operation in graph signal processing (GSP). Thus, understanding it is key in developing potent GSP methods. Graph filters are local and distributed linear operations, whose output depends only on the local neighborhood of each node. Moreover, a graph filter&#39;s output can be computed separately at each node by carrying out repeated exchanges with immediate neighbors. Graph filters can be compactly written as polynomials of a graph shift operator (typically, a sparse matrix description of the graph). This has led to relating the properties of the filters with the spectral properties of the corresponding matrix -- which encodes global structure of the graph. In this work, we propose a framework that relies solely on the local distribution of the neighborhoods of a graph. 
The crux of this approach is to describe graphs and graph signals in terms of a measurable space of rooted balls. Leveraging this, we are able to seamlessly compare graphs of different sizes and coming from different models, yielding results on the convergence of spectral densities, transferability of filters across arbitrary graphs, and continuity of graph signal properties with respect to the distribution of local substructures. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2202.10649v1-abstract-full').style.display = 'none'; document.getElementById('2202.10649v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 February, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2202.03532">arXiv:2202.03532</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2202.03532">pdf</a>, <a href="https://arxiv.org/format/2202.03532">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MINER: Multiscale Implicit Neural Representations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Saragadam%2C+V">Vishwanath Saragadam</a>, <a href="/search/cs?searchtype=author&amp;query=Tan%2C+J">Jasper Tan</a>, <a href="/search/cs?searchtype=author&amp;query=Balakrishnan%2C+G">Guha Balakrishnan</a>, <a href="/search/cs?searchtype=author&amp;query=Baraniuk%2C+R+G">Richard G. Baraniuk</a>, <a href="/search/cs?searchtype=author&amp;query=Veeraraghavan%2C+A">Ashok Veeraraghavan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2202.03532v2-abstract-short" style="display: inline;"> We introduce a new neural signal model designed for efficient high-resolution representation of large-scale signals. The key innovation in our multiscale implicit neural representation (MINER) is an internal representation via a Laplacian pyramid, which provides a sparse multiscale decomposition of the signal that captures orthogonal parts of the signal across scales. We leverage the advantages of&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2202.03532v2-abstract-full').style.display = 'inline'; document.getElementById('2202.03532v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2202.03532v2-abstract-full" style="display: none;"> We introduce a new neural signal model designed for efficient high-resolution representation of large-scale signals. The key innovation in our multiscale implicit neural representation (MINER) is an internal representation via a Laplacian pyramid, which provides a sparse multiscale decomposition of the signal that captures orthogonal parts of the signal across scales. We leverage the advantages of the Laplacian pyramid by representing small disjoint patches of the pyramid at each scale with a small MLP. 
This enables the capacity of the network to increase adaptively from coarse to fine scales and to represent only the parts of the signal with strong signal energy. The parameters of each MLP are optimized from coarse to fine scale, which results in faster approximations at coarser scales and ultimately in an extremely fast training process. We apply MINER to a range of large-scale signal representation tasks, including gigapixel images and very large point clouds, and demonstrate that it requires fewer than 25% of the parameters, 33% of the memory footprint, and 10% of the computation time of competing techniques such as ACORN to reach the same representation accuracy. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2202.03532v2-abstract-full').style.display = 'none'; document.getElementById('2202.03532v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 July, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 February, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">14 pages, accepted to ECCV 2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2202.01243">arXiv:2202.01243</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2202.01243">pdf</a>, <a href="https://arxiv.org/format/2202.01243">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Parameters or Privacy: A Provable Tradeoff Between Overparameterization and Membership Inference </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Tan%2C+J">Jasper Tan</a>, <a href="/search/cs?searchtype=author&amp;query=Mason%2C+B">Blake Mason</a>, <a href="/search/cs?searchtype=author&amp;query=Javadi%2C+H">Hamid Javadi</a>, <a href="/search/cs?searchtype=author&amp;query=Baraniuk%2C+R+G">Richard G. Baraniuk</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2202.01243v2-abstract-short" style="display: inline;"> A surprising phenomenon in modern machine learning is the ability of a highly overparameterized model to generalize well (small error on the test data) even when it is trained to memorize the training data (zero error on the training data). This has led to an arms race towards increasingly overparameterized models (c.f., deep learning). 
In this paper, we study an underexplored hidden cost of overp&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2202.01243v2-abstract-full').style.display = 'inline'; document.getElementById('2202.01243v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2202.01243v2-abstract-full" style="display: none;"> A surprising phenomenon in modern machine learning is the ability of a highly overparameterized model to generalize well (small error on the test data) even when it is trained to memorize the training data (zero error on the training data). This has led to an arms race towards increasingly overparameterized models (c.f., deep learning). In this paper, we study an underexplored hidden cost of overparameterization: the fact that overparameterized models may be more vulnerable to privacy attacks, in particular the membership inference attack that predicts the (potentially sensitive) examples used to train a model. We significantly extend the relatively few empirical results on this problem by theoretically proving for an overparameterized linear regression model in the Gaussian data setting that membership inference vulnerability increases with the number of parameters. Moreover, a range of empirical studies indicates that more complex, nonlinear models exhibit the same behavior. Finally, we extend our analysis towards ridge-regularized linear regression and show in the Gaussian data setting that increased regularization also increases membership inference vulnerability in the overparameterized regime. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2202.01243v2-abstract-full').style.display = 'none'; document.getElementById('2202.01243v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 November, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 February, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">25 pages, 8 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2110.08678">arXiv:2110.08678</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2110.08678">pdf</a>, <a href="https://arxiv.org/format/2110.08678">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Improving Transformers with Probabilistic Attention Keys </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Nguyen%2C+T">Tam Nguyen</a>, <a href="/search/cs?searchtype=author&amp;query=Nguyen%2C+T+M">Tan M. Nguyen</a>, <a href="/search/cs?searchtype=author&amp;query=Le%2C+D+D">Dung D. 
Le</a>, <a href="/search/cs?searchtype=author&amp;query=Nguyen%2C+D+K">Duy Khuong Nguyen</a>, <a href="/search/cs?searchtype=author&amp;query=Tran%2C+V">Viet-Anh Tran</a>, <a href="/search/cs?searchtype=author&amp;query=Baraniuk%2C+R+G">Richard G. Baraniuk</a>, <a href="/search/cs?searchtype=author&amp;query=Ho%2C+N">Nhat Ho</a>, <a href="/search/cs?searchtype=author&amp;query=Osher%2C+S+J">Stanley J. Osher</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2110.08678v2-abstract-short" style="display: inline;"> Multi-head attention is a driving force behind state-of-the-art transformers, which achieve remarkable performance across a variety of natural language processing (NLP) and computer vision tasks. It has been observed that for many applications, those attention heads learn redundant embedding, and most of them can be removed without degrading the performance of the model. Inspired by this observati&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.08678v2-abstract-full').style.display = 'inline'; document.getElementById('2110.08678v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2110.08678v2-abstract-full" style="display: none;"> Multi-head attention is a driving force behind state-of-the-art transformers, which achieve remarkable performance across a variety of natural language processing (NLP) and computer vision tasks. It has been observed that for many applications, those attention heads learn redundant embedding, and most of them can be removed without degrading the performance of the model. Inspired by this observation, we propose Transformer with a Mixture of Gaussian Keys (Transformer-MGK), a novel transformer architecture that replaces redundant heads in transformers with a mixture of keys at each head. These mixtures of keys follow a Gaussian mixture model and allow each attention head to focus on different parts of the input sequence efficiently. Compared to its conventional transformer counterpart, Transformer-MGK accelerates training and inference, has fewer parameters, and requires fewer FLOPs to compute while achieving comparable or better accuracy across tasks. Transformer-MGK can also be easily extended to use with linear attention. We empirically demonstrate the advantage of Transformer-MGK in a range of practical applications, including language modeling and tasks that involve very long sequences. On the Wikitext-103 and Long Range Arena benchmark, Transformer-MGKs with 4 heads attain comparable or better performance to the baseline transformers with 8 heads. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.08678v2-abstract-full').style.display = 'none'; document.getElementById('2110.08678v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 June, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 October, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2021. 
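<p class="is-size-7">A hedged sketch of attention with a mixture of Gaussian keys, as described in this abstract: each key position carries M candidate key vectors with mixing weights, and a query is scored against the log of the resulting Gaussian mixture density. The shapes and the shared isotropic variance below are illustrative simplifications, not the paper's exact parameterization.</p> <pre><code class="language-python">
import torch
import torch.nn.functional as F

def mgk_attention(q, k, v, log_pi, sigma=1.0):
    # q: (B, Tq, d) queries; k: (B, Tk, M, d) mixture key means;
    # v: (B, Tk, d) values; log_pi: (B, Tk, M) log mixing weights
    sq_dist = ((q[:, :, None, None, :] - k[:, None, :, :, :]) ** 2).sum(-1)
    # sq_dist: (B, Tq, Tk, M); score = log Gaussian-mixture density (up to
    # constants), combined over components with a log-sum-exp
    scores = torch.logsumexp(log_pi[:, None, :, :] - sq_dist / (2 * sigma ** 2),
                             dim=-1)
    attn = F.softmax(scores, dim=-1)  # (B, Tq, Tk)
    return attn @ v
</code></pre> <p class="is-size-7">With M = 1 this reduces to a squared-distance (RBF-style) attention kernel; multiple components per position let one head cover several regions of the input sequence at once.</p>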
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">27 pages, 16 figures, 10 tables</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Proceedings of the 39th International Conference on Machine Learning, Baltimore, Maryland, USA, PMLR 162, 2022 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2110.07778">arXiv:2110.07778</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2110.07778">pdf</a>, <a href="https://arxiv.org/format/2110.07778">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> NeuroView: Explainable Deep Network Decision Making </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Barberan%2C+C">CJ Barberan</a>, <a href="/search/cs?searchtype=author&amp;query=Balestriero%2C+R">Randall Balestriero</a>, <a href="/search/cs?searchtype=author&amp;query=Baraniuk%2C+R+G">Richard G. Baraniuk</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2110.07778v1-abstract-short" style="display: inline;"> Deep neural networks (DNs) provide superhuman performance in numerous computer vision tasks, yet it remains unclear exactly which of a DN&#39;s units contribute to a particular decision. NeuroView is a new family of DN architectures that are interpretable/explainable by design. Each member of the family is derived from a standard DN architecture by vector quantizing the unit output values and feeding&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.07778v1-abstract-full').style.display = 'inline'; document.getElementById('2110.07778v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2110.07778v1-abstract-full" style="display: none;"> Deep neural networks (DNs) provide superhuman performance in numerous computer vision tasks, yet it remains unclear exactly which of a DN&#39;s units contribute to a particular decision. NeuroView is a new family of DN architectures that are interpretable/explainable by design. Each member of the family is derived from a standard DN architecture by vector quantizing the unit output values and feeding them into a global linear classifier. The resulting architecture establishes a direct, causal link between the state of each unit and the classification decision. We validate NeuroView on standard datasets and classification tasks to show how its unit/class mapping aids in understanding the decision-making process. 
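<p class="is-size-7">A minimal sketch of the NeuroView construction described in this abstract, for a small MLP and using sign-of-activation as a stand-in for the paper's vector quantization: the quantized states of every unit in every layer feed one global linear classifier, giving a direct linear link from unit states to the decision.</p> <pre><code class="language-python">
import torch
import torch.nn as nn

class NeuroViewMLP(nn.Module):
    def __init__(self, dims, n_classes):
        super().__init__()
        self.layers = nn.ModuleList(
            nn.Linear(a, b) for a, b in zip(dims[:-1], dims[1:]))
        # one global linear classifier over the quantized states of all units
        self.classifier = nn.Linear(sum(dims[1:]), n_classes)

    def forward(self, x):
        states = []
        for layer in self.layers:
            x = torch.relu(layer(x))
            # quantized unit state (active or not); note the hard quantization
            # blocks gradients to the backbone in this simplified sketch
            states.append((x > 0).float())
        return self.classifier(torch.cat(states, dim=-1))

# e.g., NeuroViewMLP(dims=[784, 256, 128], n_classes=10)
</code></pre>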
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.07778v1-abstract-full').style.display = 'none'; document.getElementById('2110.07778v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 October, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 7 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2110.05240">arXiv:2110.05240</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2110.05240">pdf</a>, <a href="https://arxiv.org/format/2110.05240">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Evaluating generative networks using Gaussian mixtures of image features </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Luzi%2C+L">Lorenzo Luzi</a>, <a href="/search/cs?searchtype=author&amp;query=Marrero%2C+C+O">Carlos Ortiz Marrero</a>, <a href="/search/cs?searchtype=author&amp;query=Wynar%2C+N">Nile Wynar</a>, <a href="/search/cs?searchtype=author&amp;query=Baraniuk%2C+R+G">Richard G. Baraniuk</a>, <a href="/search/cs?searchtype=author&amp;query=Henry%2C+M+J">Michael J. Henry</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2110.05240v2-abstract-short" style="display: inline;"> We develop a measure for evaluating the performance of generative networks given two sets of images. A popular performance measure currently used to do this is the Fréchet Inception Distance (FID). FID assumes that images featurized using the penultimate layer of Inception-v3 follow a Gaussian distribution, an assumption which cannot be violated if we wish to use FID as a metric. However, we show&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.05240v2-abstract-full').style.display = 'inline'; document.getElementById('2110.05240v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2110.05240v2-abstract-full" style="display: none;"> We develop a measure for evaluating the performance of generative networks given two sets of images. A popular performance measure currently used to do this is the Fréchet Inception Distance (FID). FID assumes that images featurized using the penultimate layer of Inception-v3 follow a Gaussian distribution, an assumption which cannot be violated if we wish to use FID as a metric. However, we show that Inception-v3 features of the ImageNet dataset are not Gaussian; in particular, every single marginal is not Gaussian. To remedy this problem, we model the featurized images using Gaussian mixture models (GMMs) and compute the 2-Wasserstein distance restricted to GMMs. 

arXiv:2110.05240 (https://arxiv.org/abs/2110.05240) [pdf, other]
Subjects: Computer Vision and Pattern Recognition (cs.CV); Machine Learning (cs.LG)
Title: Evaluating generative networks using Gaussian mixtures of image features
Authors: Lorenzo Luzi, Carlos Ortiz Marrero, Nile Wynar, Richard G. Baraniuk, Michael J. Henry
Abstract: We develop a measure for evaluating the performance of generative networks given two sets of images. A popular performance measure currently used to do this is the Fréchet Inception Distance (FID). FID assumes that images featurized using the penultimate layer of Inception-v3 follow a Gaussian distribution, an assumption which cannot be violated if we wish to use FID as a metric. However, we show that Inception-v3 features of the ImageNet dataset are not Gaussian; in particular, every single marginal is not Gaussian. To remedy this problem, we model the featurized images using Gaussian mixture models (GMMs) and compute the 2-Wasserstein distance restricted to GMMs. We define a performance measure, which we call WaM, on two sets of images by using Inception-v3 (or another classifier) to featurize the images, estimate two GMMs, and use the restricted $2$-Wasserstein distance to compare the GMMs. We experimentally show the advantages of WaM over FID, including how FID is more sensitive than WaM to imperceptible image perturbations. By modelling the non-Gaussian features obtained from Inception-v3 as GMMs and using a GMM metric, we can more accurately evaluate generative network performance.
Submitted 22 July, 2022; v1 submitted 8 October, 2021; originally announced October 2021.
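
The distance itself is straightforward to prototype. Below is a hedged sketch of a WaM-style measure under stated assumptions: scikit-learn fits the two GMMs, the closed-form Gaussian 2-Wasserstein (Fréchet) distance prices component pairs, and the POT package (ot.emd2) solves the discrete transport problem over mixture weights; raw vectors stand in for Inception-v3 features.

    # Hedged sketch of a GMM-restricted 2-Wasserstein distance, in the spirit
    # of WaM; not the authors' implementation. Assumes POT: pip install pot.
    import numpy as np
    import ot
    from scipy.linalg import sqrtm
    from sklearn.mixture import GaussianMixture

    def w2_gaussian_sq(m1, C1, m2, C2):
        """Squared 2-Wasserstein (Frechet) distance between two Gaussians."""
        s = sqrtm(sqrtm(C1) @ C2 @ sqrtm(C1)).real
        return float(np.sum((m1 - m2) ** 2) + np.trace(C1 + C2 - 2 * s))

    def wam(feats_a, feats_b, k=3, seed=0):
        ga = GaussianMixture(k, random_state=seed).fit(feats_a)
        gb = GaussianMixture(k, random_state=seed).fit(feats_b)
        # Cost matrix: pairwise squared W2 between Gaussian components.
        M = np.array([[w2_gaussian_sq(ga.means_[i], ga.covariances_[i],
                                      gb.means_[j], gb.covariances_[j])
                       for j in range(k)] for i in range(k)])
        # Discrete optimal transport over the mixture weights.
        return np.sqrt(ot.emd2(ga.weights_, gb.weights_, M))

    rng = np.random.default_rng(0)
    a = rng.normal(size=(1000, 4))
    b = rng.normal(size=(1000, 4)) + 0.5
    print("WaM:", wam(a, b))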

arXiv:2110.04945 (https://arxiv.org/abs/2110.04945) [pdf, other]
Subjects: Machine Learning (cs.LG)
Title: NFT-K: Non-Fungible Tangent Kernels
Authors: Sina Alemohammad, Hossein Babaei, CJ Barberan, Naiming Liu, Lorenzo Luzi, Blake Mason, Richard G. Baraniuk
Abstract: Deep neural networks have become essential for numerous applications, such as vision, RL, and classification, due to their strong empirical performance. Unfortunately, these networks are quite difficult to interpret, and this limits their applicability in settings where interpretability is important for safety, such as medical imaging. One type of deep neural network is the neural tangent kernel, which is similar to a kernel machine and provides some degree of interpretability. To further contribute interpretability with respect to classification and the layers, we develop a new network as a combination of multiple neural tangent kernels, one modeling each layer of the deep neural network individually, as opposed to past work which attempts to represent the entire network via a single neural tangent kernel. We demonstrate the interpretability of this model on two datasets, showing that the multiple kernels model elucidates the interplay between the layers and predictions.
Submitted 10 October, 2021; originally announced October 2021.
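
As a rough illustration of "one kernel per layer", the sketch below combines per-layer kernels into a single predictor. It substitutes each layer's activation Gram matrix for the exact per-layer neural tangent kernel, which is too involved for a short example, so it shows the structure of the idea rather than the paper's model.

    # Hedged sketch: one kernel per layer, combined into a single predictor,
    # with per-layer predictions exposing each layer's contribution.
    import numpy as np

    rng = np.random.default_rng(0)
    X = rng.normal(size=(200, 10))
    y = np.sign(X[:, 0] - X[:, 1])

    # A fixed random ReLU network supplies layerwise representations.
    Ws = [rng.normal(size=(10, 64)) / np.sqrt(10),
          rng.normal(size=(64, 64)) / np.sqrt(64)]

    def layer_feats(X):
        feats, h = [], X
        for W in Ws:
            h = np.maximum(h @ W, 0.0)
            feats.append(h)
        return feats

    kernels = [F @ F.T for F in layer_feats(X)]  # stand-in kernel per layer
    alpha = [np.linalg.solve(K + 1e-1 * np.eye(len(X)), y) for K in kernels]

    for l, (K, a) in enumerate(zip(kernels, alpha)):
        print(f"layer {l} train acc:", np.mean(np.sign(K @ a) == y))
    pred = np.sign(sum(K @ a for K, a in zip(kernels, alpha)) / len(kernels))
    print("combined acc:", np.mean(pred == y))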

arXiv:2110.02915 (https://arxiv.org/abs/2110.02915) [pdf, other]
Subjects: Machine Learning (cs.LG); Signal Processing (eess.SP); Computation (stat.CO)
Title: Unrolling Particles: Unsupervised Learning of Sampling Distributions
Authors: Fernando Gama, Nicolas Zilberstein, Richard G. Baraniuk, Santiago Segarra
Abstract: Particle filtering is used to compute good nonlinear estimates of complex systems. It samples trajectories from a chosen distribution and computes the estimate as a weighted average. Easy-to-sample distributions often lead to degenerate samples where only one trajectory carries all the weight, negatively affecting the resulting performance of the estimate. While much research has been done on the design of appropriate sampling distributions that would lead to controlled degeneracy, in this paper our objective is to learn sampling distributions. Leveraging the framework of algorithm unrolling, we model the sampling distribution as a multivariate normal, and we use neural networks to learn both the mean and the covariance. We carry out unsupervised training of the model to minimize weight degeneracy, relying only on the observed measurements of the system. We show in simulations that the resulting particle filter yields good estimates in a wide range of scenarios.
Submitted 6 October, 2021; originally announced October 2021.
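
One unrolled step of such a filter can be sketched as follows. The "learned" proposal here is a placeholder linear-Gaussian map, and the toy dynamics and observation models are assumptions of the sketch; in the paper's setting, training would adjust the proposal parameters to reduce weight degeneracy, for which the effective sample size computed at the end is the natural signal.

    # Hedged sketch of one particle-filter step with a learnable Gaussian
    # sampling distribution; placeholder models throughout.
    import numpy as np

    rng = np.random.default_rng(0)
    N, dx = 256, 2
    particles = rng.normal(size=(N, dx))
    weights = np.full(N, 1.0 / N)
    y_obs = np.array([1.0])

    # Placeholder "learned" proposal: mean is linear in the particle; the
    # covariance factor chol would also be learned.
    A, b = rng.normal(size=(dx, dx)) * 0.1 + np.eye(dx), np.zeros(dx)
    chol = 0.5 * np.eye(dx)

    def transition_logpdf(xn, x):      # toy dynamics: x_{t+1} ~ N(0.9 x, I)
        return -0.5 * np.sum((xn - 0.9 * x) ** 2, axis=-1)
    def likelihood_logpdf(y, x):       # toy observation: y ~ N(x_0, 0.1)
        return -0.5 * (y[0] - x[:, 0]) ** 2 / 0.1
    def proposal_logpdf(xn, mean):     # log N(xn; mean, chol chol^T), up to const
        diff = np.linalg.solve(chol, (xn - mean).T).T
        return -0.5 * np.sum(diff ** 2, axis=-1)

    mean = particles @ A.T + b                     # proposal mean per particle
    new = mean + rng.normal(size=(N, dx)) @ chol.T
    logw = (np.log(weights) + transition_logpdf(new, particles)
            + likelihood_logpdf(y_obs, new) - proposal_logpdf(new, mean))
    w = np.exp(logw - logw.max()); w /= w.sum()
    ess = 1.0 / np.sum(w ** 2)   # degeneracy measure, i.e. the training signal
    print(f"effective sample size: {ess:.1f} of {N}")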

arXiv:2109.04546 (https://arxiv.org/abs/2109.04546) [pdf, other]
Subjects: Computation and Language (cs.CL)
Title: Math Word Problem Generation with Mathematical Consistency and Problem Context Constraints
Authors: Zichao Wang, Andrew S. Lan, Richard G. Baraniuk
Abstract: We study the problem of generating arithmetic math word problems (MWPs) given a math equation that specifies the mathematical computation and a context that specifies the problem scenario. Existing approaches are prone to generating MWPs that are either mathematically invalid or have unsatisfactory language quality. They also either ignore the context or require manual specification of a problem template, which compromises the diversity of the generated MWPs. In this paper, we develop a novel MWP generation approach that leverages i) pre-trained language models and a context keyword selection model to improve the language quality of the generated MWPs and ii) an equation consistency constraint for math equations to improve the mathematical validity of the generated MWPs. Extensive quantitative and qualitative experiments on three real-world MWP datasets demonstrate the superior performance of our approach compared to various baselines.
Submitted 9 September, 2021; originally announced September 2021.
Comments: EMNLP 2021
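
The mathematical-validity side of the approach amounts to checking that the equation decoded back from a generated problem agrees with the source equation. A hedged sketch of that consistency check, with a stubbed-out decoder and hypothetical helper names, using sympy:

    # Illustration of an equation-consistency check; the equation decoder that
    # would read an equation off the generated word problem is stubbed out.
    import sympy as sp

    def equations_equivalent(eq_a: str, eq_b: str) -> bool:
        """True if two equation strings, e.g. 'x = 3*4', share a solution set."""
        la, ra = eq_a.split("="); lb, rb = eq_b.split("=")
        sol_a = sp.solve(sp.Eq(sp.sympify(la), sp.sympify(ra)))
        sol_b = sp.solve(sp.Eq(sp.sympify(lb), sp.sympify(rb)))
        return sol_a == sol_b

    source_eq = "x = 3*4"
    decoded_eq = "x = 12"    # would come from an equation decoder on the MWP
    print(equations_equivalent(source_eq, decoded_eq))  # True -> consistent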

arXiv:2109.02355 (https://arxiv.org/abs/2109.02355) [pdf, other]
Subjects: Machine Learning (stat.ML); Machine Learning (cs.LG)
Title: A Farewell to the Bias-Variance Tradeoff? An Overview of the Theory of Overparameterized Machine Learning
Authors: Yehuda Dar, Vidya Muthukumar, Richard G. Baraniuk
Abstract: The rapid recent progress in machine learning (ML) has raised a number of scientific questions that challenge the longstanding dogma of the field. One of the most important riddles is the good empirical generalization of overparameterized models. Overparameterized models are excessively complex with respect to the size of the training dataset, which results in them perfectly fitting (i.e., interpolating) the training data, which is usually noisy. Such interpolation of noisy data is traditionally associated with detrimental overfitting, and yet a wide range of interpolating models -- from simple linear models to deep neural networks -- have recently been observed to generalize extremely well on fresh test data. Indeed, the recently discovered double descent phenomenon has revealed that highly overparameterized models often improve over the best underparameterized model in test performance. Understanding learning in this overparameterized regime requires new theory and foundational empirical studies, even for the simplest case of the linear model. The underpinnings of this understanding have been laid in very recent analyses of overparameterized linear regression and related statistical learning tasks, which resulted in precise analytic characterizations of double descent. This paper provides a succinct overview of this emerging theory of overparameterized ML (henceforth abbreviated as TOPML) that explains these recent findings through a statistical signal processing perspective. We emphasize the unique aspects that define the TOPML research area as a subfield of modern ML theory and outline interesting open questions that remain.
Submitted 6 September, 2021; originally announced September 2021.
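
Double descent, the phenomenon at the center of this overview, is easy to reproduce even in the linear setting the paper emphasizes: the test error of the minimum L2-norm least-squares fit spikes near the interpolation threshold p = n and descends again as p grows. A minimal self-contained experiment with illustrative parameter choices:

    # Minimal double descent demo for min-norm linear regression.
    import numpy as np

    rng = np.random.default_rng(0)
    n, d, sigma = 40, 120, 0.5
    beta = rng.normal(size=d) / np.sqrt(d)
    Xtr, Xte = rng.normal(size=(n, d)), rng.normal(size=(1000, d))
    ytr = Xtr @ beta + sigma * rng.normal(size=n)
    yte = Xte @ beta                      # noiseless test targets

    for p in [10, 20, 35, 40, 45, 60, 120]:   # number of features used
        bhat = np.linalg.pinv(Xtr[:, :p]) @ ytr   # minimum L2-norm solution
        err = np.mean((Xte[:, :p] @ bhat - yte) ** 2)
        print(f"p={p:3d}  test MSE={err:.2f}")    # spikes near p = n = 40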

arXiv:2106.07769 (https://arxiv.org/abs/2106.07769) [pdf, other]
Subjects: Machine Learning (cs.LG); Machine Learning (stat.ML)
Title: The Flip Side of the Reweighted Coin: Duality of Adaptive Dropout and Regularization
Authors: Daniel LeJeune, Hamid Javadi, Richard G. Baraniuk
Abstract: Among the most successful methods for sparsifying deep (neural) networks are those that adaptively mask the network weights throughout training. By examining this masking, or dropout, in the linear case, we uncover a duality between such adaptive methods and regularization through the so-called "$η$-trick" that casts both as iteratively reweighted optimizations. We show that any dropout strategy that adapts to the weights in a monotonic way corresponds to an effective subquadratic regularization penalty, and therefore leads to sparse solutions. We obtain the effective penalties for several popular sparsification strategies, which are remarkably similar to classical penalties commonly used in sparse optimization. Considering variational dropout as a case study, we demonstrate similar empirical behavior between the adaptive dropout method and classical methods on the task of deep network sparsification, validating our theory.
Submitted 3 January, 2022; v1 submitted 14 June, 2021; originally announced June 2021.
Comments: 19 pages, 2 figures. Appeared in NeurIPS 2021. Small typographical correction
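
The $η$-trick the duality rests on is the identity |w| = min_{η>0} (w²/η + η)/2, which lets an L1-type penalty be minimized by alternating a weighted ridge step with a closed-form η update. A minimal sketch of that iteratively reweighted optimization, on toy data with illustrative constants:

    # Iteratively reweighted least squares for the lasso via the eta-trick.
    import numpy as np

    rng = np.random.default_rng(0)
    n, d, lam = 100, 20, 1.0
    X = rng.normal(size=(n, d))
    w_true = np.zeros(d); w_true[:3] = [3.0, -2.0, 1.5]
    y = X @ w_true + 0.1 * rng.normal(size=n)

    eta = np.ones(d)
    for _ in range(100):
        # weighted ridge step: argmin ||Xw - y||^2 + lam * sum_i w_i^2 / eta_i
        w = np.linalg.solve(X.T @ X + lam * np.diag(1.0 / eta), X.T @ y)
        eta = np.abs(w) + 1e-8           # closed-form eta update
    print("recovered support:", np.where(np.abs(w) > 1e-2)[0])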

arXiv:2104.07824 (https://arxiv.org/abs/2104.07824) [pdf, ps, other]
Subjects: Machine Learning (cs.LG); Machine Learning (stat.ML)
Title: NePTuNe: Neural Powered Tucker Network for Knowledge Graph Completion
Authors: Shashank Sonkar, Arzoo Katiyar, Richard G. Baraniuk
Abstract: Knowledge graphs link entities through relations to provide a structured representation of real world facts. However, they are often incomplete, because they are based on only a small fraction of all plausible facts. The task of knowledge graph completion via link prediction aims to overcome this challenge by inferring missing facts represented as links between entities. Current approaches to link prediction leverage tensor factorization and/or deep learning. Factorization methods train and deploy rapidly thanks to their small number of parameters but have limited expressiveness due to their underlying linear methodology. Deep learning methods are more expressive but also computationally expensive and prone to overfitting due to their large number of trainable parameters. We propose Neural Powered Tucker Network (NePTuNe), a new hybrid link prediction model that couples the expressiveness of deep models with the speed and size of linear models. We demonstrate that NePTuNe provides state-of-the-art performance on the FB15K-237 dataset and near state-of-the-art performance on the WN18RR dataset.
Submitted 15 April, 2021; originally announced April 2021.
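
A Tucker-style link-prediction scorer of the kind NePTuNe builds on can be sketched in a few lines. The dimensions, the tanh nonlinearity, and its placement below are assumptions of the sketch, not the paper's exact architecture: a shared core tensor combines subject and relation embeddings, a nonlinearity adds expressiveness, and a dot product with object embeddings ranks candidate facts.

    # Illustrative Tucker-style scoring for link prediction.
    import numpy as np

    rng = np.random.default_rng(0)
    n_ent, n_rel, de, dr = 100, 10, 16, 8
    E = rng.normal(size=(n_ent, de))          # entity embeddings
    R = rng.normal(size=(n_rel, dr))          # relation embeddings
    W = rng.normal(size=(dr, de, de)) * 0.1   # Tucker core tensor

    def score(subj, rel):
        """Scores (subj, rel, o) for every candidate object o."""
        Wr = np.einsum("r,rse->se", R[rel], W)   # relation-specific matrix
        h = np.tanh(E[subj] @ Wr)                # nonlinear interaction
        return h @ E.T                           # one score per object

    print("top objects:", np.argsort(score(0, 0))[::-1][:5])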

arXiv:2104.04034 (https://arxiv.org/abs/2104.04034) [pdf, other]
Subjects: Computers and Society (cs.CY); Human-Computer Interaction (cs.HC)
Title: Results and Insights from Diagnostic Questions: The NeurIPS 2020 Education Challenge
Authors: Zichao Wang, Angus Lamb, Evgeny Saveliev, Pashmina Cameron, Yordan Zaykov, Jose Miguel Hernandez-Lobato, Richard E. Turner, Richard G. Baraniuk, Craig Barton, Simon Peyton Jones, Simon Woodhead, Cheng Zhang
Abstract: This competition concerns educational diagnostic questions, which are pedagogically effective, multiple-choice questions (MCQs) whose distractors embody misconceptions. With a large and ever-increasing number of such questions, it becomes overwhelming for teachers to know which questions are the best ones to use for their students. We thus seek to answer the following question: how can we use data on hundreds of millions of answers to MCQs to drive automatic personalized learning in large-scale learning scenarios where manual personalization is infeasible? Success in using MCQ data at scale helps build more intelligent, personalized learning platforms that ultimately improve the quality of education en masse. To this end, we introduce a new, large-scale, real-world dataset and formulate 4 data mining tasks on MCQs that mimic real learning scenarios and target various aspects of the above question in a competition setting at NeurIPS 2020. We report on our NeurIPS competition in which nearly 400 teams submitted approximately 4000 submissions, with encouragingly diverse and effective approaches to each of our tasks.
Submitted 8 April, 2021; originally announced April 2021.
Comments: arXiv admin note: text overlap with arXiv:2007.12061

arXiv:2103.05621 (https://arxiv.org/abs/2103.05621) [pdf, other]
Subjects: Machine Learning (cs.LG)
Title: The Common Intuition to Transfer Learning Can Win or Lose: Case Studies for Linear Regression
Authors: Yehuda Dar, Daniel LeJeune, Richard G. Baraniuk
Abstract: We study a fundamental transfer learning process from source to target linear regression tasks, including overparameterized settings where there are more learned parameters than data samples. The target task learning is addressed by using its training data together with the parameters previously computed for the source task. We define a transfer learning approach to the target task as a linear regression optimization with a regularization on the distance between the to-be-learned target parameters and the already-learned source parameters. We analytically characterize the generalization performance of our transfer learning approach and demonstrate its ability to resolve the peak in generalization errors in double descent phenomena of the minimum L2-norm solution to linear regression. Moreover, we show that for sufficiently related tasks, the optimally tuned transfer learning approach can outperform the optimally tuned ridge regression method, even when the true parameter vector conforms to an isotropic Gaussian prior distribution. Namely, we demonstrate that transfer learning can beat the minimum mean square error (MMSE) solution of the independent target task. Our results emphasize the ability of transfer learning to extend the solution space to the target task and, by that, to have an improved MMSE solution. We formulate the linear MMSE solution to our transfer learning setting and point out its key differences from the common design philosophy to transfer learning.
Submitted 31 May, 2024; v1 submitted 9 March, 2021; originally announced March 2021.
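
The transfer formulation described above is a least-squares problem with a proximity penalty to the source parameters, min_w ||Xw - y||^2 + λ||w - w_src||^2, which has a closed-form solution. A minimal sketch on toy data (λ untuned; the task-relatedness level is an assumption of the example):

    # Ridge regression regularized toward already-learned source parameters.
    import numpy as np

    rng = np.random.default_rng(0)
    n, d, lam = 30, 50, 1.0                   # overparameterized: d > n
    w_src = rng.normal(size=d)                # parameters from the source task
    w_tgt = w_src + 0.1 * rng.normal(size=d)  # related target task
    X = rng.normal(size=(n, d))
    y = X @ w_tgt + 0.1 * rng.normal(size=n)

    # Closed form of min_w ||Xw - y||^2 + lam * ||w - w_src||^2.
    w = np.linalg.solve(X.T @ X + lam * np.eye(d), X.T @ y + lam * w_src)
    print("transfer distance to target:", np.linalg.norm(w - w_tgt))
    print("plain-ridge distance:       ",
          np.linalg.norm(np.linalg.solve(X.T @ X + lam * np.eye(d),
                                         X.T @ y) - w_tgt))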

arXiv:2010.13975 (https://arxiv.org/abs/2010.13975) [pdf, other]
Subjects: Signal Processing (eess.SP); Machine Learning (cs.LG)
Title: Wearing a MASK: Compressed Representations of Variable-Length Sequences Using Recurrent Neural Tangent Kernels
Authors: Sina Alemohammad, Hossein Babaei, Randall Balestriero, Matt Y. Cheung, Ahmed Imtiaz Humayun, Daniel LeJeune, Naiming Liu, Lorenzo Luzi, Jasper Tan, Zichao Wang, Richard G. Baraniuk
Abstract: High dimensionality poses many challenges to the use of data, from visualization and interpretation, to prediction and storage for historical preservation. Techniques abound to reduce the dimensionality of fixed-length sequences, yet these methods rarely generalize to variable-length sequences. To address this gap, we extend existing methods that rely on the use of kernels to variable-length sequences via use of the Recurrent Neural Tangent Kernel (RNTK). Since a deep neural network with ReLU activation is a Max-Affine Spline Operator (MASO), we dub our approach Max-Affine Spline Kernel (MASK). We demonstrate how MASK can be used to extend principal components analysis (PCA) and t-distributed stochastic neighbor embedding (t-SNE) and apply these new algorithms to separate synthetic time series data sampled from second-order differential equations.
Submitted 17 April, 2021; v1 submitted 26 October, 2020; originally announced October 2020.
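
The kernel-PCA extension MASK describes needs only a kernel matrix between variable-length sequences. The sketch below uses a crude stand-in sequence kernel in place of the RNTK, which is too long to reproduce here; the double-centering and eigendecomposition are the standard kernel PCA steps that any such kernel slots into.

    # Kernel PCA on variable-length sequences, with a placeholder kernel
    # standing in for the RNTK.
    import numpy as np

    rng = np.random.default_rng(0)
    seqs = [rng.normal(size=rng.integers(5, 15)) for _ in range(40)]  # varied lengths

    def standin_kernel(a, b):
        """Placeholder sequence kernel built from mean/std summary statistics."""
        return np.exp(-((a.mean() - b.mean()) ** 2 + (a.std() - b.std()) ** 2))

    K = np.array([[standin_kernel(a, b) for b in seqs] for a in seqs])
    n = len(seqs)
    H = np.eye(n) - np.ones((n, n)) / n
    Kc = H @ K @ H                               # double-center the kernel matrix
    vals, vecs = np.linalg.eigh(Kc)
    emb = vecs[:, -2:] * np.sqrt(np.maximum(vals[-2:], 0))  # top-2 embedding
    print("embedding shape:", emb.shape)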