Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–35 of 35 results for author: <span class="mathjax">Michael, J</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Michael%2C+J">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Michael, J"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Michael%2C+J&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Michael, J"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.17805">arXiv:2501.17805</a> <span> [<a href="https://arxiv.org/pdf/2501.17805">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> International AI Safety Report </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Bengio%2C+Y">Yoshua Bengio</a>, <a href="/search/cs?searchtype=author&query=Mindermann%2C+S">S枚ren Mindermann</a>, <a href="/search/cs?searchtype=author&query=Privitera%2C+D">Daniel Privitera</a>, <a href="/search/cs?searchtype=author&query=Besiroglu%2C+T">Tamay Besiroglu</a>, <a href="/search/cs?searchtype=author&query=Bommasani%2C+R">Rishi Bommasani</a>, <a href="/search/cs?searchtype=author&query=Casper%2C+S">Stephen Casper</a>, <a href="/search/cs?searchtype=author&query=Choi%2C+Y">Yejin Choi</a>, <a href="/search/cs?searchtype=author&query=Fox%2C+P">Philip Fox</a>, <a href="/search/cs?searchtype=author&query=Garfinkel%2C+B">Ben Garfinkel</a>, <a href="/search/cs?searchtype=author&query=Goldfarb%2C+D">Danielle Goldfarb</a>, <a href="/search/cs?searchtype=author&query=Heidari%2C+H">Hoda Heidari</a>, <a href="/search/cs?searchtype=author&query=Ho%2C+A">Anson Ho</a>, <a href="/search/cs?searchtype=author&query=Kapoor%2C+S">Sayash Kapoor</a>, <a href="/search/cs?searchtype=author&query=Khalatbari%2C+L">Leila Khalatbari</a>, <a href="/search/cs?searchtype=author&query=Longpre%2C+S">Shayne Longpre</a>, <a href="/search/cs?searchtype=author&query=Manning%2C+S">Sam Manning</a>, <a href="/search/cs?searchtype=author&query=Mavroudis%2C+V">Vasilios Mavroudis</a>, <a href="/search/cs?searchtype=author&query=Mazeika%2C+M">Mantas Mazeika</a>, <a href="/search/cs?searchtype=author&query=Michael%2C+J">Julian Michael</a>, <a href="/search/cs?searchtype=author&query=Newman%2C+J">Jessica Newman</a>, <a href="/search/cs?searchtype=author&query=Ng%2C+K+Y">Kwan Yee Ng</a>, <a href="/search/cs?searchtype=author&query=Okolo%2C+C+T">Chinasa T. Okolo</a>, <a href="/search/cs?searchtype=author&query=Raji%2C+D">Deborah Raji</a>, <a href="/search/cs?searchtype=author&query=Sastry%2C+G">Girish Sastry</a>, <a href="/search/cs?searchtype=author&query=Seger%2C+E">Elizabeth Seger</a> , et al. 
   (71 additional authors not shown)
   Abstract: The first International AI Safety Report comprehensively synthesizes the current evidence on the capabilities, risks, and safety of advanced AI systems. The report was mandated by the nations attending the AI Safety Summit in Bletchley, UK. Thirty nations, the UN, the OECD, and the EU each nominated a representative to the report's Expert Advisory Panel. A total of 100 AI experts contributed, representing diverse perspectives and disciplines. Led by the report's Chair, these independent experts collectively had full discretion over the report's content.
   Submitted 29 January, 2025; originally announced January 2025.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.14093">arXiv:2412.14093</a> <span> [<a href="https://arxiv.org/pdf/2412.14093">pdf</a>, <a href="https://arxiv.org/format/2412.14093">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Alignment faking in large language models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Greenblatt%2C+R">Ryan Greenblatt</a>, <a href="/search/cs?searchtype=author&query=Denison%2C+C">Carson Denison</a>, <a href="/search/cs?searchtype=author&query=Wright%2C+B">Benjamin Wright</a>, <a href="/search/cs?searchtype=author&query=Roger%2C+F">Fabien Roger</a>, <a href="/search/cs?searchtype=author&query=MacDiarmid%2C+M">Monte MacDiarmid</a>, <a href="/search/cs?searchtype=author&query=Marks%2C+S">Sam Marks</a>, <a href="/search/cs?searchtype=author&query=Treutlein%2C+J">Johannes Treutlein</a>, <a href="/search/cs?searchtype=author&query=Belonax%2C+T">Tim Belonax</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jack Chen</a>, <a href="/search/cs?searchtype=author&query=Duvenaud%2C+D">David Duvenaud</a>, <a href="/search/cs?searchtype=author&query=Khan%2C+A">Akbir Khan</a>, <a href="/search/cs?searchtype=author&query=Michael%2C+J">Julian Michael</a>, <a href="/search/cs?searchtype=author&query=Mindermann%2C+S">S枚ren Mindermann</a>, <a href="/search/cs?searchtype=author&query=Perez%2C+E">Ethan Perez</a>, <a href="/search/cs?searchtype=author&query=Petrini%2C+L">Linda Petrini</a>, <a href="/search/cs?searchtype=author&query=Uesato%2C+J">Jonathan Uesato</a>, <a href="/search/cs?searchtype=author&query=Kaplan%2C+J">Jared Kaplan</a>, <a href="/search/cs?searchtype=author&query=Shlegeris%2C+B">Buck Shlegeris</a>, <a href="/search/cs?searchtype=author&query=Bowman%2C+S+R">Samuel R. Bowman</a>, <a href="/search/cs?searchtype=author&query=Hubinger%2C+E">Evan Hubinger</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.14093v2-abstract-short" style="display: inline;"> We present a demonstration of a large language model engaging in alignment faking: selectively complying with its training objective in training to prevent modification of its behavior out of training. First, we give Claude 3 Opus a system prompt stating it is being trained to answer all queries, even harmful ones, which conflicts with its prior training to refuse such queries. To allow the model… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.14093v2-abstract-full').style.display = 'inline'; document.getElementById('2412.14093v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.14093v2-abstract-full" style="display: none;"> We present a demonstration of a large language model engaging in alignment faking: selectively complying with its training objective in training to prevent modification of its behavior out of training. 
   First, we give Claude 3 Opus a system prompt stating it is being trained to answer all queries, even harmful ones, which conflicts with its prior training to refuse such queries. To allow the model to infer when it is in training, we say it will be trained only on conversations with free users, not paid users. We find the model complies with harmful queries from free users 14% of the time, versus almost never for paid users. Explaining this gap, in almost all cases where the model complies with a harmful query from a free user, we observe explicit alignment-faking reasoning, with the model stating it is strategically answering harmful queries in training to preserve its preferred harmlessness behavior out of training. Next, we study a more realistic setting where information about the training process is provided not in a system prompt, but by training on synthetic documents that mimic pre-training data--and observe similar alignment faking. Finally, we study the effect of actually training the model to comply with harmful queries via reinforcement learning, which we find increases the rate of alignment-faking reasoning to 78%, though also increases compliance even out of training. We additionally observe other behaviors such as the model exfiltrating its weights when given an easy opportunity. While we made alignment faking easier by telling the model when and by what criteria it was being trained, we did not instruct the model to fake alignment or give it any explicit goal. As future models might infer information about their training process without being told, our results suggest a risk of alignment faking in future models, whether due to a benign preference--as in this case--or not.
   Submitted 19 December, 2024; v1 submitted 18 December, 2024; originally announced December 2024.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07494">arXiv:2411.07494</a> <span> [<a href="https://arxiv.org/pdf/2411.07494">pdf</a>, <a href="https://arxiv.org/format/2411.07494">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Rapid Response: Mitigating LLM Jailbreaks with a Few Examples </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Peng%2C+A">Alwin Peng</a>, <a href="/search/cs?searchtype=author&query=Michael%2C+J">Julian Michael</a>, <a href="/search/cs?searchtype=author&query=Sleight%2C+H">Henry Sleight</a>, <a href="/search/cs?searchtype=author&query=Perez%2C+E">Ethan Perez</a>, <a href="/search/cs?searchtype=author&query=Sharma%2C+M">Mrinank Sharma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.07494v1-abstract-short" style="display: inline;"> As large language models (LLMs) grow more powerful, ensuring their safety against misuse becomes crucial. While researchers have focused on developing robust defenses, no method has yet achieved complete invulnerability to attacks. We propose an alternative approach: instead of seeking perfect adversarial robustness, we develop rapid response techniques to look to block whole classes of jailbreaks… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07494v1-abstract-full').style.display = 'inline'; document.getElementById('2411.07494v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.07494v1-abstract-full" style="display: none;"> As large language models (LLMs) grow more powerful, ensuring their safety against misuse becomes crucial. While researchers have focused on developing robust defenses, no method has yet achieved complete invulnerability to attacks. We propose an alternative approach: instead of seeking perfect adversarial robustness, we develop rapid response techniques to look to block whole classes of jailbreaks after observing only a handful of attacks. To study this setting, we develop RapidResponseBench, a benchmark that measures a defense's robustness against various jailbreak strategies after adapting to a few observed examples. We evaluate five rapid response methods, all of which use jailbreak proliferation, where we automatically generate additional jailbreaks similar to the examples observed. Our strongest method, which fine-tunes an input classifier to block proliferated jailbreaks, reduces attack success rate by a factor greater than 240 on an in-distribution set of jailbreaks and a factor greater than 15 on an out-of-distribution set, having observed just one example of each jailbreaking strategy. Moreover, further studies suggest that the quality of proliferation model and number of proliferated examples play an key role in the effectiveness of this defense. Overall, our results highlight the potential of responding rapidly to novel jailbreaks to limit LLM misuse. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07494v1-abstract-full').style.display = 'none'; document.getElementById('2411.07494v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.16636">arXiv:2409.16636</a> <span> [<a href="https://arxiv.org/pdf/2409.16636">pdf</a>, <a href="https://arxiv.org/format/2409.16636">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Training Language Models to Win Debates with Self-Play Improves Judge Accuracy </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Arnesen%2C+S">Samuel Arnesen</a>, <a href="/search/cs?searchtype=author&query=Rein%2C+D">David Rein</a>, <a href="/search/cs?searchtype=author&query=Michael%2C+J">Julian Michael</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.16636v1-abstract-short" style="display: inline;"> We test the robustness of debate as a method of scalable oversight by training models to debate with data generated via self-play. In a long-context reading comprehension task, we find that language model based evaluators answer questions more accurately when judging models optimized to win debates. By contrast, we find no such relationship for consultancy models trained to persuade a judge withou… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.16636v1-abstract-full').style.display = 'inline'; document.getElementById('2409.16636v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.16636v1-abstract-full" style="display: none;"> We test the robustness of debate as a method of scalable oversight by training models to debate with data generated via self-play. In a long-context reading comprehension task, we find that language model based evaluators answer questions more accurately when judging models optimized to win debates. By contrast, we find no such relationship for consultancy models trained to persuade a judge without an opposing debater present. In quantitative and qualitative comparisons between our debate models and novel consultancy baselines, we find evidence that debate training encourages stronger and more informative arguments, showing promise that it can help provide high-quality supervision for tasks that are difficult to directly evaluate. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.16636v1-abstract-full').style.display = 'none'; document.getElementById('2409.16636v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">48 pages, 12 figures; code at https://github.com/samuelarnesen/nyu-debate-modeling</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.0; I.2.6 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.01502">arXiv:2405.01502</a> <span> [<a href="https://arxiv.org/pdf/2405.01502">pdf</a>, <a href="https://arxiv.org/format/2405.01502">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Analyzing the Role of Semantic Representations in the Era of Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jin%2C+Z">Zhijing Jin</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yuen Chen</a>, <a href="/search/cs?searchtype=author&query=Gonzalez%2C+F">Fernando Gonzalez</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jiarui Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiayi Zhang</a>, <a href="/search/cs?searchtype=author&query=Michael%2C+J">Julian Michael</a>, <a href="/search/cs?searchtype=author&query=Sch%C3%B6lkopf%2C+B">Bernhard Sch枚lkopf</a>, <a href="/search/cs?searchtype=author&query=Diab%2C+M">Mona Diab</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.01502v1-abstract-short" style="display: inline;"> Traditionally, natural language processing (NLP) models often use a rich set of features created by linguistic expertise, such as semantic representations. However, in the era of large language models (LLMs), more and more tasks are turned into generic, end-to-end sequence generation problems. In this paper, we investigate the question: what is the role of semantic representations in the era of LL… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.01502v1-abstract-full').style.display = 'inline'; document.getElementById('2405.01502v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.01502v1-abstract-full" style="display: none;"> Traditionally, natural language processing (NLP) models often use a rich set of features created by linguistic expertise, such as semantic representations. However, in the era of large language models (LLMs), more and more tasks are turned into generic, end-to-end sequence generation problems. 
   In this paper, we investigate the question: what is the role of semantic representations in the era of LLMs? Specifically, we investigate the effect of Abstract Meaning Representation (AMR) across five diverse NLP tasks. We propose an AMR-driven chain-of-thought prompting method, which we call AMRCoT, and find that it generally hurts performance more than it helps. To investigate what AMR may have to offer on these tasks, we conduct a series of analysis experiments. We find that it is difficult to predict which input examples AMR may help or hurt on, but errors tend to arise with multi-word expressions, named entities, and in the final inference step where the LLM must connect its reasoning over the AMR to its prediction. We recommend focusing on these areas for future work in semantic representations for LLMs. Our code: https://github.com/causalNLP/amr_llm.
   Submitted 2 May, 2024; originally announced May 2024.
   Comments: NAACL 2024

6. arXiv:2403.07162 [pdf, other]  cs.SE, cs.ET
   Digital Twin Evolution for Sustainable Smart Ecosystems
   Authors: Judith Michael, Istvan David, Dominik Bork
   Abstract: Smart ecosystems are the drivers of modern society.
   They control infrastructures of socio-techno-economic importance, ensuring their stable and sustainable operation. Smart ecosystems are governed by digital twins -- real-time virtual representations of physical infrastructure. To support the open-ended and reactive traits of smart ecosystems, digital twins need to be able to evolve in reaction to changing conditions. However, digital twin evolution is challenged by the intertwined nature of physical and software components, and their individual evolution. As a consequence, software practitioners find a substantial body of knowledge on software evolution hard to apply in digital twin evolution scenarios and a lack of knowledge on the digital twin evolution itself. The aim of this paper, consequently, is to provide software practitioners with tangible leads toward understanding and managing the evolutionary concerns of digital twins. We use four distinct digital twin evolution scenarios, contextualized in a citizen energy community case to illustrate the usage of the 7R taxonomy of digital twin evolution. By that, we aim to bridge a significant gap in leveraging software engineering practices to develop robust smart ecosystems.
   Submitted 19 August, 2024; v1 submitted 11 March, 2024; originally announced March 2024.

7. arXiv:2403.05518 [pdf, other]  cs.CL, cs.AI
   Bias-Augmented Consistency Training Reduces Biased Reasoning in Chain-of-Thought
   Authors: James Chua, Edward Rees, Hunar Batra, Samuel R. Bowman, Julian Michael, Ethan Perez, Miles Turpin
   Abstract: While chain-of-thought prompting (CoT) has the potential to improve the explainability of language model reasoning, it can systematically misrepresent the factors influencing models' behavior--for example, rationalizing answers in line with a user's opinion without mentioning this bias.
   To mitigate this biased reasoning problem, we introduce bias-augmented consistency training (BCT), an unsupervised fine-tuning scheme that trains models to give consistent reasoning across prompts with and without biasing features. We construct a suite testing nine forms of biased reasoning on seven question-answering tasks, and find that applying BCT to GPT-3.5-Turbo with one bias reduces the rate of biased reasoning by 86% on held-out tasks. Moreover, this model generalizes to other forms of bias, reducing biased reasoning on held-out biases by an average of 37%. As BCT generalizes to held-out biases and does not require gold labels, this method may hold promise for reducing biased reasoning from as-of-yet unknown biases and on tasks where supervision for ground truth reasoning is unavailable.
   Submitted 8 March, 2024; originally announced March 2024.

8. arXiv:2402.07791 [pdf, other]  cs.SE
   Discovering Decision Manifolds to Assure Trusted Autonomous Systems
   Authors: Matthew Litton, Doron Drusinsky, James Bret Michael
   Abstract: Developing and fielding complex systems requires proof that they are reliably correct with respect to their design and operating requirements. Especially for autonomous systems which exhibit unanticipated emergent behavior, fully enumerating the range of possible correct and incorrect behaviors is intractable.
   Therefore, we propose an optimization-based search technique for generating high-quality, high-variance, and non-trivial data which captures the range of correct and incorrect responses a system could exhibit. This manifold between desired and undesired behavior provides a more detailed understanding of system reliability than traditional testing or Monte Carlo simulations. After discovering data points along the manifold, we apply machine learning techniques to quantify the decision manifold's underlying mathematical function. Such models serve as correctness properties which can be utilized to enable both verification during development and testing, as well as continuous assurance during operation, even amidst system adaptations and dynamic operating environments. This method can be applied in combination with a simulator in order to provide evidence of dependability to system designers and users, with the ultimate aim of establishing trust in the deployment of complex systems. In this proof-of-concept, we apply our method to a software-in-the-loop evaluation of an autonomous vehicle.
   Submitted 26 February, 2024; v1 submitted 12 February, 2024; originally announced February 2024.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.00349">arXiv:2312.00349</a> <span> [<a href="https://arxiv.org/pdf/2312.00349">pdf</a>, <a href="https://arxiv.org/format/2312.00349">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> The Case for Scalable, Data-Driven Theory: A Paradigm for Scientific Progress in NLP </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Michael%2C+J">Julian Michael</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.00349v1-abstract-short" style="display: inline;"> I propose a paradigm for scientific progress in NLP centered around developing scalable, data-driven theories of linguistic structure. The idea is to collect data in tightly scoped, carefully defined ways which allow for exhaustive annotation of behavioral phenomena of interest, and then use machine learning to construct explanatory theories of these phenomena which can form building blocks for in… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.00349v1-abstract-full').style.display = 'inline'; document.getElementById('2312.00349v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.00349v1-abstract-full" style="display: none;"> I propose a paradigm for scientific progress in NLP centered around developing scalable, data-driven theories of linguistic structure. The idea is to collect data in tightly scoped, carefully defined ways which allow for exhaustive annotation of behavioral phenomena of interest, and then use machine learning to construct explanatory theories of these phenomena which can form building blocks for intelligible AI systems. After laying some conceptual groundwork, I describe several investigations into data-driven theories of shallow semantic structure using Question-Answer driven Semantic Role Labeling (QA-SRL), a schema for annotating verbal predicate-argument relations using highly constrained question-answer pairs. While this only scratches the surface of the complex language behaviors of interest in AI, I outline principles for data collection and theoretical modeling which can inform future scientific progress. This note summarizes and draws heavily on my PhD thesis. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.00349v1-abstract-full').style.display = 'none'; document.getElementById('2312.00349v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages, 3 figures, 2 tables. 
   ACM Class: I.2.7

10. arXiv:2311.12022 [pdf, other]  cs.AI, cs.CL
   GPQA: A Graduate-Level Google-Proof Q&A Benchmark
   Authors: David Rein, Betty Li Hou, Asa Cooper Stickland, Jackson Petty, Richard Yuanzhe Pang, Julien Dirani, Julian Michael, Samuel R. Bowman
   Abstract: We present GPQA, a challenging dataset of 448 multiple-choice questions written by domain experts in biology, physics, and chemistry. We ensure that the questions are high-quality and extremely difficult: experts who have or are pursuing PhDs in the corresponding domains reach 65% accuracy (74% when discounting clear mistakes the experts identified in retrospect), while highly skilled non-expert validators only reach 34% accuracy, despite spending on average over 30 minutes with unrestricted access to the web (i.e., the questions are "Google-proof"). The questions are also difficult for state-of-the-art AI systems, with our strongest GPT-4 based baseline achieving 39% accuracy. If we are to use future AI systems to help us answer very hard questions, for example, when developing new scientific knowledge, we need to develop scalable oversight methods that enable humans to supervise their outputs, which may be difficult even if the supervisors are themselves skilled and knowledgeable.
   The difficulty of GPQA both for skilled non-experts and frontier AI systems should enable realistic scalable oversight experiments, which we hope can help devise ways for human experts to reliably get truthful information from AI systems that surpass human capabilities.
   Submitted 20 November, 2023; originally announced November 2023.
   Comments: 28 pages, 5 figures, 7 tables

11. arXiv:2311.08702 [pdf, other]  cs.AI, cs.CL
   Debate Helps Supervise Unreliable Experts
   Authors: Julian Michael, Salsabila Mahdi, David Rein, Jackson Petty, Julien Dirani, Vishakh Padmakumar, Samuel R. Bowman
   Abstract: As AI systems are used to answer more difficult questions and potentially help create new knowledge, judging the truthfulness of their outputs becomes more difficult and more important.
   How can we supervise unreliable experts, which have access to the truth but may not accurately report it, to give answers that are systematically true and don't just superficially seem true, when the supervisor can't tell the difference between the two on their own? In this work, we show that debate between two unreliable experts can help a non-expert judge more reliably identify the truth. We collect a dataset of human-written debates on hard reading comprehension questions where the judge has not read the source passage, only ever seeing expert arguments and short quotes selectively revealed by 'expert' debaters who have access to the passage. In our debates, one expert argues for the correct answer, and the other for an incorrect answer. Comparing debate to a baseline we call consultancy, where a single expert argues for only one answer which is correct half of the time, we find that debate performs significantly better, with 84% judge accuracy compared to consultancy's 74%. Debates are also more efficient, being 68% of the length of consultancies. By comparing human to AI debaters, we find evidence that with more skilled (in this case, human) debaters, the performance of debate goes up but the performance of consultancy goes down. Our error analysis also supports this trend, with 46% of errors in human debate attributable to mistakes by the honest debater (which should go away with increased skill); whereas 52% of errors in human consultancy are due to debaters obfuscating the relevant evidence from the judge (which should become worse with increased skill). Overall, these results show that debate is a promising approach for supervising increasingly capable but potentially unreliable AI systems.
   Submitted 15 November, 2023; originally announced November 2023.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">84 pages, 13 footnotes, 5 figures, 4 tables, 28 debate transcripts; data and code at https://github.com/julianmichael/debate/tree/2023-nyu-experiments</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.0 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.04388">arXiv:2305.04388</a> <span> [<a href="https://arxiv.org/pdf/2305.04388">pdf</a>, <a href="https://arxiv.org/format/2305.04388">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Language Models Don't Always Say What They Think: Unfaithful Explanations in Chain-of-Thought Prompting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Turpin%2C+M">Miles Turpin</a>, <a href="/search/cs?searchtype=author&query=Michael%2C+J">Julian Michael</a>, <a href="/search/cs?searchtype=author&query=Perez%2C+E">Ethan Perez</a>, <a href="/search/cs?searchtype=author&query=Bowman%2C+S+R">Samuel R. Bowman</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.04388v2-abstract-short" style="display: inline;"> Large Language Models (LLMs) can achieve strong performance on many tasks by producing step-by-step reasoning before giving a final output, often referred to as chain-of-thought reasoning (CoT). It is tempting to interpret these CoT explanations as the LLM's process for solving a task. This level of transparency into LLMs' predictions would yield significant safety benefits. However, we find that… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.04388v2-abstract-full').style.display = 'inline'; document.getElementById('2305.04388v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.04388v2-abstract-full" style="display: none;"> Large Language Models (LLMs) can achieve strong performance on many tasks by producing step-by-step reasoning before giving a final output, often referred to as chain-of-thought reasoning (CoT). It is tempting to interpret these CoT explanations as the LLM's process for solving a task. This level of transparency into LLMs' predictions would yield significant safety benefits. However, we find that CoT explanations can systematically misrepresent the true reason for a model's prediction. We demonstrate that CoT explanations can be heavily influenced by adding biasing features to model inputs--e.g., by reordering the multiple-choice options in a few-shot prompt to make the answer always "(A)"--which models systematically fail to mention in their explanations. When we bias models toward incorrect answers, they frequently generate CoT explanations rationalizing those answers. This causes accuracy to drop by as much as 36% on a suite of 13 tasks from BIG-Bench Hard, when testing with GPT-3.5 from OpenAI and Claude 1.0 from Anthropic. 
On a social-bias task, model explanations justify giving answers in line with stereotypes without mentioning the influence of these social biases. Our findings indicate that CoT explanations can be plausible yet misleading, which risks increasing our trust in LLMs without guaranteeing their safety. Building more transparent and explainable systems will require either improving CoT faithfulness through targeted efforts or abandoning CoT in favor of alternative methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.04388v2-abstract-full').style.display = 'none'; document.getElementById('2305.04388v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2304.14399">arXiv:2304.14399</a> <span> [<a href="https://arxiv.org/pdf/2304.14399">pdf</a>, <a href="https://arxiv.org/format/2304.14399">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> We're Afraid Language Models Aren't Modeling Ambiguity </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+A">Alisa Liu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Z">Zhaofeng Wu</a>, <a href="/search/cs?searchtype=author&query=Michael%2C+J">Julian Michael</a>, <a href="/search/cs?searchtype=author&query=Suhr%2C+A">Alane Suhr</a>, <a href="/search/cs?searchtype=author&query=West%2C+P">Peter West</a>, <a href="/search/cs?searchtype=author&query=Koller%2C+A">Alexander Koller</a>, <a href="/search/cs?searchtype=author&query=Swayamdipta%2C+S">Swabha Swayamdipta</a>, <a href="/search/cs?searchtype=author&query=Smith%2C+N+A">Noah A. Smith</a>, <a href="/search/cs?searchtype=author&query=Choi%2C+Y">Yejin Choi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2304.14399v2-abstract-short" style="display: inline;"> Ambiguity is an intrinsic feature of natural language. Managing ambiguity is a key part of human language understanding, allowing us to anticipate misunderstanding as communicators and revise our interpretations as listeners. As language models (LMs) are increasingly employed as dialogue interfaces and writing aids, handling ambiguous language is critical to their success. We characterize ambiguit… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.14399v2-abstract-full').style.display = 'inline'; document.getElementById('2304.14399v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2304.14399v2-abstract-full" style="display: none;"> Ambiguity is an intrinsic feature of natural language. 
Managing ambiguity is a key part of human language understanding, allowing us to anticipate misunderstanding as communicators and revise our interpretations as listeners. As language models (LMs) are increasingly employed as dialogue interfaces and writing aids, handling ambiguous language is critical to their success. We characterize ambiguity in a sentence by its effect on entailment relations with another sentence, and collect AmbiEnt, a linguist-annotated benchmark of 1,645 examples with diverse kinds of ambiguity. We design a suite of tests based on AmbiEnt, presenting the first evaluation of pretrained LMs to recognize ambiguity and disentangle possible meanings. We find that the task remains extremely challenging, including for GPT-4, whose generated disambiguations are considered correct only 32% of the time in human evaluation, compared to 90% for disambiguations in our dataset. Finally, to illustrate the value of ambiguity-sensitive tools, we show that a multilabel NLI model can flag political claims in the wild that are misleading due to ambiguity. We encourage the field to rediscover the importance of ambiguity for NLP. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.14399v2-abstract-full').style.display = 'none'; document.getElementById('2304.14399v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 April, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">EMNLP 2023 camera-ready</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2208.12852">arXiv:2208.12852</a> <span> [<a href="https://arxiv.org/pdf/2208.12852">pdf</a>, <a href="https://arxiv.org/format/2208.12852">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> What Do NLP Researchers Believe? Results of the NLP Community Metasurvey </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Michael%2C+J">Julian Michael</a>, <a href="/search/cs?searchtype=author&query=Holtzman%2C+A">Ari Holtzman</a>, <a href="/search/cs?searchtype=author&query=Parrish%2C+A">Alicia Parrish</a>, <a href="/search/cs?searchtype=author&query=Mueller%2C+A">Aaron Mueller</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+A">Alex Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+A">Angelica Chen</a>, <a href="/search/cs?searchtype=author&query=Madaan%2C+D">Divyam Madaan</a>, <a href="/search/cs?searchtype=author&query=Nangia%2C+N">Nikita Nangia</a>, <a href="/search/cs?searchtype=author&query=Pang%2C+R+Y">Richard Yuanzhe Pang</a>, <a href="/search/cs?searchtype=author&query=Phang%2C+J">Jason Phang</a>, <a href="/search/cs?searchtype=author&query=Bowman%2C+S+R">Samuel R. 
Bowman</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2208.12852v1-abstract-short" style="display: inline;"> We present the results of the NLP Community Metasurvey. Run from May to June 2022, the survey elicited opinions on controversial issues, including industry influence in the field, concerns about AGI, and ethics. Our results put concrete numbers to several controversies: For example, respondents are split almost exactly in half on questions about the importance of artificial general intelligence, w… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2208.12852v1-abstract-full').style.display = 'inline'; document.getElementById('2208.12852v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2208.12852v1-abstract-full" style="display: none;"> We present the results of the NLP Community Metasurvey. Run from May to June 2022, the survey elicited opinions on controversial issues, including industry influence in the field, concerns about AGI, and ethics. Our results put concrete numbers to several controversies: For example, respondents are split almost exactly in half on questions about the importance of artificial general intelligence, whether language models understand language, and the necessity of linguistic structure and inductive bias for solving NLP problems. In addition, the survey posed meta-questions, asking respondents to predict the distribution of survey responses. This allows us not only to gain insight on the spectrum of beliefs held by NLP researchers, but also to uncover false sociological beliefs where the community's predictions don't match reality. We find such mismatches on a wide range of issues. Among other results, the community greatly overestimates its own belief in the usefulness of benchmarks and the potential for scaling to solve real-world problems, while underestimating its own belief in the importance of linguistic structure, inductive bias, and interdisciplinary science. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2208.12852v1-abstract-full').style.display = 'none'; document.getElementById('2208.12852v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 August, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2022. 
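<p class="is-size-7">A rough sketch of the meta-question comparison described above: for one survey question, contrast the share of respondents who actually agree with the share they predict will agree. The field names here are hypothetical, not the released survey schema.</p> <pre><code>
# Illustrative only: a large gap between actual and predicted agreement is what the
# abstract calls a false sociological belief about the community.
from statistics import mean

def sociological_gap(responses):
    """responses: dicts with boolean 'agrees' and float 'predicted_agree_rate' (0 to 1)."""
    actual = mean(r["agrees"] for r in responses)
    predicted = mean(r["predicted_agree_rate"] for r in responses)
    return actual - predicted  # positive: the community underestimates its own agreement
</code></pre>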
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">31 pages, 19 figures, 3 tables; more information at https://nlpsurvey.net</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.7 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2206.12492">arXiv:2206.12492</a> <span> [<a href="https://arxiv.org/pdf/2206.12492">pdf</a>, <a href="https://arxiv.org/format/2206.12492">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> Guidelines for Artifacts to Support Industry-Relevant Research on Self-Adaptation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Weyns%2C+D">Danny Weyns</a>, <a href="/search/cs?searchtype=author&query=Gerostathopoulos%2C+I">Ilias Gerostathopoulos</a>, <a href="/search/cs?searchtype=author&query=Buhnova%2C+B">Barbora Buhnova</a>, <a href="/search/cs?searchtype=author&query=Cardozo%2C+N">Nicolas Cardozo</a>, <a href="/search/cs?searchtype=author&query=Cioroaica%2C+E">Emilia Cioroaica</a>, <a href="/search/cs?searchtype=author&query=Dusparic%2C+I">Ivana Dusparic</a>, <a href="/search/cs?searchtype=author&query=Grunske%2C+L">Lars Grunske</a>, <a href="/search/cs?searchtype=author&query=Jamshidi%2C+P">Pooyan Jamshidi</a>, <a href="/search/cs?searchtype=author&query=Julien%2C+C">Christine Julien</a>, <a href="/search/cs?searchtype=author&query=Michael%2C+J">Judith Michael</a>, <a href="/search/cs?searchtype=author&query=Moreno%2C+G">Gabriel Moreno</a>, <a href="/search/cs?searchtype=author&query=Nejati%2C+S">Shiva Nejati</a>, <a href="/search/cs?searchtype=author&query=Pelliccione%2C+P">Patrizio Pelliccione</a>, <a href="/search/cs?searchtype=author&query=Quin%2C+F">Federico Quin</a>, <a href="/search/cs?searchtype=author&query=Rodrigues%2C+G">Genaina Rodrigues</a>, <a href="/search/cs?searchtype=author&query=Schmerl%2C+B">Bradley Schmerl</a>, <a href="/search/cs?searchtype=author&query=Vieira%2C+M">Marco Vieira</a>, <a href="/search/cs?searchtype=author&query=Vogel%2C+T">Thomas Vogel</a>, <a href="/search/cs?searchtype=author&query=Wohlrab%2C+R">Rebekka Wohlrab</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2206.12492v1-abstract-short" style="display: inline;"> Artifacts support evaluating new research results and help comparing them with the state of the art in a field of interest. Over the past years, several artifacts have been introduced to support research in the field of self-adaptive systems. 
While these artifacts have shown their value, it is not clear to what extent these artifacts support research on problems in self-adaptation that are relevan… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2206.12492v1-abstract-full').style.display = 'inline'; document.getElementById('2206.12492v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2206.12492v1-abstract-full" style="display: none;"> Artifacts support evaluating new research results and help comparing them with the state of the art in a field of interest. Over the past years, several artifacts have been introduced to support research in the field of self-adaptive systems. While these artifacts have shown their value, it is not clear to what extent these artifacts support research on problems in self-adaptation that are relevant to industry. This paper provides a set of guidelines for artifacts that aim at supporting industry-relevant research on self-adaptation. The guidelines that are grounded on data obtained from a survey with practitioners were derived during working sessions at the 17th International Symposium on Software Engineering for Adaptive and Self-Managing Systems. Artifact providers can use the guidelines for aligning future artifacts with industry needs; they can also be used to evaluate the industrial relevance of existing artifacts. We also propose an artifact template. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2206.12492v1-abstract-full').style.display = 'none'; document.getElementById('2206.12492v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 June, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2205.13792">arXiv:2205.13792</a> <span> [<a href="https://arxiv.org/pdf/2205.13792">pdf</a>, <a href="https://arxiv.org/format/2205.13792">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> kNN-Prompt: Nearest Neighbor Zero-Shot Inference </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shi%2C+W">Weijia Shi</a>, <a href="/search/cs?searchtype=author&query=Michael%2C+J">Julian Michael</a>, <a href="/search/cs?searchtype=author&query=Gururangan%2C+S">Suchin Gururangan</a>, <a href="/search/cs?searchtype=author&query=Zettlemoyer%2C+L">Luke Zettlemoyer</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2205.13792v2-abstract-short" style="display: inline;"> Retrieval-augmented language models (LMs) use non-parametric memory to substantially outperform their non-retrieval counterparts on perplexity-based evaluations, but it is an open question whether they achieve similar gains in few- and zero-shot end-task accuracy. 
We extensively study one such model, the k-nearest neighbor LM (kNN-LM), showing that the gains marginally transfer. The main challenge… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2205.13792v2-abstract-full').style.display = 'inline'; document.getElementById('2205.13792v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2205.13792v2-abstract-full" style="display: none;"> Retrieval-augmented language models (LMs) use non-parametric memory to substantially outperform their non-retrieval counterparts on perplexity-based evaluations, but it is an open question whether they achieve similar gains in few- and zero-shot end-task accuracy. We extensively study one such model, the k-nearest neighbor LM (kNN-LM), showing that the gains marginally transfer. The main challenge is to achieve coverage of the verbalizer tokens that define the different end-task class labels. To address this challenge, we also introduce kNN-Prompt, a simple and effective kNN-LM with automatically expanded fuzzy verbalizers (e.g. to expand terrible to also include silly and other task-specific synonyms for sentiment classification). Across nine diverse end-tasks, using kNN-Prompt with GPT-2 large yields significant performance boosts over strong zero-shot baselines (13.4% absolute improvement over the base LM on average). We also show that other advantages of non-parametric augmentation hold for end tasks; kNN-Prompt is effective for domain adaptation with no further training, and gains increase with the size of the retrieval model. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2205.13792v2-abstract-full').style.display = 'none'; document.getElementById('2205.13792v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 November, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 May, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2022. 
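<p class="is-size-7">A small sketch of the zero-shot scoring idea in this abstract, assuming toy data structures rather than the paper's released implementation: a kNN-interpolated next-token distribution is summed over an automatically expanded ("fuzzy") verbalizer set for each label.</p> <pre><code>
# Hedged sketch, not the authors' code. Distributions are plain dicts mapping token to probability.
def interpolate(p_lm, p_knn, lam=0.25):
    """kNN-LM style interpolation of the base LM with the retrieval distribution."""
    tokens = set(p_lm) | set(p_knn)
    return {t: (1 - lam) * p_lm.get(t, 0.0) + lam * p_knn.get(t, 0.0) for t in tokens}

def score_labels(next_token_probs, fuzzy_verbalizers):
    """Sum probability mass over each label's expanded verbalizer tokens."""
    return {label: sum(next_token_probs.get(tok, 0.0) for tok in tokens)
            for label, tokens in fuzzy_verbalizers.items()}

# Toy sentiment example with a fuzzy expansion ("silly" added to the negative set):
verbalizers = {"negative": {"terrible", "awful", "silly"},
               "positive": {"great", "wonderful", "lovely"}}
probs = interpolate({"terrible": 0.03, "great": 0.01}, {"silly": 0.04})
scores = score_labels(probs, verbalizers)
print(max(scores, key=scores.get))  # "negative"
</code></pre>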
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2110.07027">arXiv:2110.07027</a> <span> [<a href="https://arxiv.org/pdf/2110.07027">pdf</a>, <a href="https://arxiv.org/format/2110.07027">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Comparison of SVD and factorized TDNN approaches for speech to text </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Michael%2C+J+J">Jeffrey Josanne Michael</a>, <a href="/search/cs?searchtype=author&query=Goel%2C+N+K">Nagendra Kumar Goel</a>, <a href="/search/cs?searchtype=author&query=K%2C+N">Navneeth K</a>, <a href="/search/cs?searchtype=author&query=Robertson%2C+J">Jonas Robertson</a>, <a href="/search/cs?searchtype=author&query=Mishra%2C+S">Shravan Mishra</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2110.07027v1-abstract-short" style="display: inline;"> This work concentrates on reducing the RTF and word error rate of a hybrid HMM-DNN. Our baseline system uses an architecture with TDNN and LSTM layers. We find this architecture particularly useful for lightly reverberated environments. However, these models tend to demand more computation than is desirable. In this work, we explore alternate architectures in which singular value decomposition (S… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.07027v1-abstract-full').style.display = 'inline'; document.getElementById('2110.07027v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2110.07027v1-abstract-full" style="display: none;"> This work concentrates on reducing the RTF and word error rate of a hybrid HMM-DNN. Our baseline system uses an architecture with TDNN and LSTM layers. We find this architecture particularly useful for lightly reverberated environments. However, these models tend to demand more computation than is desirable. In this work, we explore alternate architectures in which singular value decomposition (SVD) is applied to the TDNN layers to reduce the RTF, as well as to the affine transforms of every LSTM cell. We compare this approach with specifying bottleneck layers similar to those introduced by SVD before training. Additionally, we reduced the search space of the decoding graph to make it a better fit to operate in real-time applications. We report a 61.57% relative reduction in RTF and almost 1% relative decrease in WER for our architecture trained on Fisher data along with reverberated versions of this dataset in order to match one of our target test distributions.
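<p class="is-size-7">A brief numeric sketch of the low-rank idea in this abstract, assuming NumPy and a made-up layer size rather than the authors' actual recipe: factor a trained affine weight matrix with SVD, keep the top-k singular values, and replace one large multiply with two smaller ones.</p> <pre><code>
# Illustrative only; the real systems factorize TDNN layers and LSTM affine transforms.
import numpy as np

def svd_factorize(W, k):
    """Return (A, B) with A @ B approximating W; A is (out, k), B is (k, in)."""
    U, s, Vt = np.linalg.svd(W, full_matrices=False)
    A = U[:, :k] * s[:k]   # absorb the singular values into the left factor
    B = Vt[:k, :]
    return A, B

W = np.random.randn(1024, 1024)          # hypothetical affine transform
A, B = svd_factorize(W, k=256)
# One 1024x1024 multiply (about 1.05M MACs per frame) becomes two multiplies totalling
# 2 * 1024 * 256 (about 0.52M MACs), roughly halving this layer's cost.
print(np.linalg.norm(W - A @ B) / np.linalg.norm(W))   # relative approximation error
</code></pre>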
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.07027v1-abstract-full').style.display = 'none'; document.getElementById('2110.07027v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 October, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">4 pages, 1 figure, 3 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2109.04832">arXiv:2109.04832</a> <span> [<a href="https://arxiv.org/pdf/2109.04832">pdf</a>, <a href="https://arxiv.org/format/2109.04832">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Asking It All: Generating Contextualized Questions for any Semantic Role </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Pyatkin%2C+V">Valentina Pyatkin</a>, <a href="/search/cs?searchtype=author&query=Roit%2C+P">Paul Roit</a>, <a href="/search/cs?searchtype=author&query=Michael%2C+J">Julian Michael</a>, <a href="/search/cs?searchtype=author&query=Tsarfaty%2C+R">Reut Tsarfaty</a>, <a href="/search/cs?searchtype=author&query=Goldberg%2C+Y">Yoav Goldberg</a>, <a href="/search/cs?searchtype=author&query=Dagan%2C+I">Ido Dagan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2109.04832v1-abstract-short" style="display: inline;"> Asking questions about a situation is an inherent step towards understanding it. To this end, we introduce the task of role question generation, which, given a predicate mention and a passage, requires producing a set of questions asking about all possible semantic roles of the predicate. We develop a two-stage model for this task, which first produces a context-independent question prototype for… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2109.04832v1-abstract-full').style.display = 'inline'; document.getElementById('2109.04832v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2109.04832v1-abstract-full" style="display: none;"> Asking questions about a situation is an inherent step towards understanding it. To this end, we introduce the task of role question generation, which, given a predicate mention and a passage, requires producing a set of questions asking about all possible semantic roles of the predicate. We develop a two-stage model for this task, which first produces a context-independent question prototype for each role and then revises it to be contextually appropriate for the passage. Unlike most existing approaches to question generation, our approach does not require conditioning on existing answers in the text. Instead, we condition on the type of information to inquire about, regardless of whether the answer appears explicitly in the text, could be inferred from it, or should be sought elsewhere. 
Our evaluation demonstrates that we generate diverse and well-formed questions for a large, broad-coverage ontology of predicates and roles. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2109.04832v1-abstract-full').style.display = 'none'; document.getElementById('2109.04832v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 September, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted as a long paper to EMNLP 2021, Main Conference</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2106.06823">arXiv:2106.06823</a> <span> [<a href="https://arxiv.org/pdf/2106.06823">pdf</a>, <a href="https://arxiv.org/format/2106.06823">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Prompting Contrastive Explanations for Commonsense Reasoning Tasks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Paranjape%2C+B">Bhargavi Paranjape</a>, <a href="/search/cs?searchtype=author&query=Michael%2C+J">Julian Michael</a>, <a href="/search/cs?searchtype=author&query=Ghazvininejad%2C+M">Marjan Ghazvininejad</a>, <a href="/search/cs?searchtype=author&query=Zettlemoyer%2C+L">Luke Zettlemoyer</a>, <a href="/search/cs?searchtype=author&query=Hajishirzi%2C+H">Hannaneh Hajishirzi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2106.06823v1-abstract-short" style="display: inline;"> Many commonsense reasoning NLP tasks involve choosing between one or more possible answers to a question or prompt based on knowledge that is often implicit. Large pretrained language models (PLMs) can achieve near-human performance on such tasks, while providing little human-interpretable evidence of the underlying reasoning they use. In this work, we show how to use these same models to generate… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2106.06823v1-abstract-full').style.display = 'inline'; document.getElementById('2106.06823v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2106.06823v1-abstract-full" style="display: none;"> Many commonsense reasoning NLP tasks involve choosing between one or more possible answers to a question or prompt based on knowledge that is often implicit. Large pretrained language models (PLMs) can achieve near-human performance on such tasks, while providing little human-interpretable evidence of the underlying reasoning they use. 
In this work, we show how to use these same models to generate such evidence: inspired by the contrastive nature of human explanations, we use PLMs to complete explanation prompts which contrast alternatives according to the key attribute(s) required to justify the correct answer (for example, peanuts are usually salty while raisins are sweet). Conditioning model decisions on these explanations improves performance on two commonsense reasoning benchmarks, as compared to previous non-contrastive alternatives. These explanations are also judged by humans to be more relevant for solving the task, and facilitate a novel method to evaluate explanation faithfulness. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2106.06823v1-abstract-full').style.display = 'none'; document.getElementById('2106.06823v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 June, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ACL 2021 Findings</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2006.14255">arXiv:2006.14255</a> <span> [<a href="https://arxiv.org/pdf/2006.14255">pdf</a>, <a href="https://arxiv.org/format/2006.14255">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> SS-CAM: Smoothed Score-CAM for Sharper Visual Feature Localization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+H">Haofan Wang</a>, <a href="/search/cs?searchtype=author&query=Naidu%2C+R">Rakshit Naidu</a>, <a href="/search/cs?searchtype=author&query=Michael%2C+J">Joy Michael</a>, <a href="/search/cs?searchtype=author&query=Kundu%2C+S+S">Soumya Snigdha Kundu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2006.14255v3-abstract-short" style="display: inline;"> Interpretation of the underlying mechanisms of Deep Convolutional Neural Networks has become an important aspect of research in the field of deep learning due to their applications in high-risk environments. To explain these black-box architectures there have been many methods applied so the internal decisions can be analyzed and understood. In this paper, built on the top of Score-CAM, we introdu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2006.14255v3-abstract-full').style.display = 'inline'; document.getElementById('2006.14255v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2006.14255v3-abstract-full" style="display: none;"> Interpretation of the underlying mechanisms of Deep Convolutional Neural Networks has become an important aspect of research in the field of deep learning due to their applications in high-risk environments.
To explain these black-box architectures there have been many methods applied so the internal decisions can be analyzed and understood. In this paper, built on the top of Score-CAM, we introduce an enhanced visual explanation in terms of visual sharpness called SS-CAM, which produces centralized localization of object features within an image through a smooth operation. We evaluate our method on the ILSVRC 2012 Validation dataset, which outperforms Score-CAM on both faithfulness and localization tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2006.14255v3-abstract-full').style.display = 'none'; document.getElementById('2006.14255v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 25 June, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7 pages, 4 figures and 4 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2004.14513">arXiv:2004.14513</a> <span> [<a href="https://arxiv.org/pdf/2004.14513">pdf</a>, <a href="https://arxiv.org/format/2004.14513">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Asking without Telling: Exploring Latent Ontologies in Contextual Representations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Michael%2C+J">Julian Michael</a>, <a href="/search/cs?searchtype=author&query=Botha%2C+J+A">Jan A. Botha</a>, <a href="/search/cs?searchtype=author&query=Tenney%2C+I">Ian Tenney</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2004.14513v2-abstract-short" style="display: inline;"> The success of pretrained contextual encoders, such as ELMo and BERT, has brought a great deal of interest in what these models learn: do they, without explicit supervision, learn to encode meaningful notions of linguistic structure? If so, how is this structure encoded? To investigate this, we introduce latent subclass learning (LSL): a modification to existing classifier-based probing methods th… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2004.14513v2-abstract-full').style.display = 'inline'; document.getElementById('2004.14513v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2004.14513v2-abstract-full" style="display: none;"> The success of pretrained contextual encoders, such as ELMo and BERT, has brought a great deal of interest in what these models learn: do they, without explicit supervision, learn to encode meaningful notions of linguistic structure? If so, how is this structure encoded? 
To investigate this, we introduce latent subclass learning (LSL): a modification to existing classifier-based probing methods that induces a latent categorization (or ontology) of the probe's inputs. Without access to fine-grained gold labels, LSL extracts emergent structure from input representations in an interpretable and quantifiable form. In experiments, we find strong evidence of familiar categories, such as a notion of personhood in ELMo, as well as novel ontological distinctions, such as a preference for fine-grained semantic roles on core arguments. Our results provide unique new evidence of emergent structure in pretrained encoders, including departures from existing annotations which are inaccessible to earlier methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2004.14513v2-abstract-full').style.display = 'none'; document.getElementById('2004.14513v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 October, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 April, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">21 pages, 8 figures, 11 tables. Published in EMNLP 2020</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.7 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2004.10645">arXiv:2004.10645</a> <span> [<a href="https://arxiv.org/pdf/2004.10645">pdf</a>, <a href="https://arxiv.org/format/2004.10645">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> AmbigQA: Answering Ambiguous Open-domain Questions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Min%2C+S">Sewon Min</a>, <a href="/search/cs?searchtype=author&query=Michael%2C+J">Julian Michael</a>, <a href="/search/cs?searchtype=author&query=Hajishirzi%2C+H">Hannaneh Hajishirzi</a>, <a href="/search/cs?searchtype=author&query=Zettlemoyer%2C+L">Luke Zettlemoyer</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2004.10645v2-abstract-short" style="display: inline;"> Ambiguity is inherent to open-domain question answering; especially when exploring new topics, it can be difficult to ask questions that have a single, unambiguous answer. In this paper, we introduce AmbigQA, a new open-domain question answering task which involves finding every plausible answer, and then rewriting the question for each one to resolve the ambiguity. 
To study this task, we construc… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2004.10645v2-abstract-full').style.display = 'inline'; document.getElementById('2004.10645v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2004.10645v2-abstract-full" style="display: none;"> Ambiguity is inherent to open-domain question answering; especially when exploring new topics, it can be difficult to ask questions that have a single, unambiguous answer. In this paper, we introduce AmbigQA, a new open-domain question answering task which involves finding every plausible answer, and then rewriting the question for each one to resolve the ambiguity. To study this task, we construct AmbigNQ, a dataset covering 14,042 questions from NQ-open, an existing open-domain QA benchmark. We find that over half of the questions in NQ-open are ambiguous, with diverse sources of ambiguity such as event and entity references. We also present strong baseline models for AmbigQA which we show benefit from weakly supervised learning that incorporates NQ-open, strongly suggesting our new task and data will support significant future research effort. Our data and baselines are available at https://nlp.cs.washington.edu/ambigqa. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2004.10645v2-abstract-full').style.display = 'none'; document.getElementById('2004.10645v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 October, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 22 April, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published as a conference paper at EMNLP 2020 (long)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2003.10365">arXiv:2003.10365</a> <span> [<a href="https://arxiv.org/pdf/2003.10365">pdf</a>, <a href="https://arxiv.org/format/2003.10365">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> On Interactive Machine Learning and the Potential of Cognitive Feedback </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Michael%2C+C+J">Chris J. 
Michael</a>, <a href="/search/cs?searchtype=author&query=Acklin%2C+D">Dina Acklin</a>, <a href="/search/cs?searchtype=author&query=Scheuerman%2C+J">Jaelle Scheuerman</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2003.10365v1-abstract-short" style="display: inline;"> In order to increase productivity, capability, and data exploitation, numerous defense applications are experiencing an integration of state-of-the-art machine learning and AI into their architectures. Especially for defense applications, having a human analyst in the loop is of high interest due to quality control, accountability, and complex subject matter expertise not readily automated or repl… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2003.10365v1-abstract-full').style.display = 'inline'; document.getElementById('2003.10365v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2003.10365v1-abstract-full" style="display: none;"> In order to increase productivity, capability, and data exploitation, numerous defense applications are experiencing an integration of state-of-the-art machine learning and AI into their architectures. Especially for defense applications, having a human analyst in the loop is of high interest due to quality control, accountability, and complex subject matter expertise not readily automated or replicated by AI. However, many applications are suffering from a very slow transition. This may be in large part due to lack of trust, usability, and productivity, especially when adapting to unforeseen classes and changes in mission context. Interactive machine learning is a newly emerging field in which machine learning implementations are trained, optimized, evaluated, and exploited through an intuitive human-computer interface. In this paper, we introduce interactive machine learning and explain its advantages and limitations within the context of defense applications. Furthermore, we address several of the shortcomings of interactive machine learning by discussing how cognitive feedback may inform features, data, and results in the state of the art. We define the three techniques by which cognitive feedback may be employed: self reporting, implicit cognitive feedback, and modeled cognitive feedback. The advantages and disadvantages of each technique are discussed. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2003.10365v1-abstract-full').style.display = 'none'; document.getElementById('2003.10365v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 March, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2020. 
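<p class="is-size-7">A very rough sketch of an analyst-in-the-loop update cycle with the three cognitive-feedback channels named in this abstract; every identifier here is a hypothetical illustration of the taxonomy, not an interface from the paper.</p> <pre><code>
# Illustrative only: self reporting, implicit, and modeled cognitive feedback
# attached to each analyst interaction in an interactive-ML loop.
from enum import Enum

class CognitiveFeedback(Enum):
    SELF_REPORTING = "analyst explicitly states confidence or rationale"
    IMPLICIT = "inferred from behaviour, e.g. dwell time or corrections"
    MODELED = "estimated by a cognitive model of the analyst"

def interactive_loop(model, stream, ask_analyst, observe_feedback):
    """Each item is predicted, reviewed by the analyst, and fed back into the model."""
    for item in stream:
        prediction = model.predict(item)
        label = ask_analyst(item, prediction)       # human quality control
        channel = observe_feedback(item)            # one CognitiveFeedback member
        model.update(item, label, channel)
</code></pre>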
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">14 pages, 2 figures, submitted and accepted to the 2nd Workshop on Deep Models and Artificial Intelligence for Defense Applications: Potentials, Theories, Practices, Tools and Risks sponsored by the Association for the Advancement of Artificial Intelligence in cooperation with the Stanford University Computer Science Department</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1911.03243">arXiv:1911.03243</a> <span> [<a href="https://arxiv.org/pdf/1911.03243">pdf</a>, <a href="https://arxiv.org/ps/1911.03243">ps</a>, <a href="https://arxiv.org/format/1911.03243">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Controlled Crowdsourcing for High-Quality QA-SRL Annotation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Roit%2C+P">Paul Roit</a>, <a href="/search/cs?searchtype=author&query=Klein%2C+A">Ayal Klein</a>, <a href="/search/cs?searchtype=author&query=Stepanov%2C+D">Daniela Stepanov</a>, <a href="/search/cs?searchtype=author&query=Mamou%2C+J">Jonathan Mamou</a>, <a href="/search/cs?searchtype=author&query=Michael%2C+J">Julian Michael</a>, <a href="/search/cs?searchtype=author&query=Stanovsky%2C+G">Gabriel Stanovsky</a>, <a href="/search/cs?searchtype=author&query=Zettlemoyer%2C+L">Luke Zettlemoyer</a>, <a href="/search/cs?searchtype=author&query=Dagan%2C+I">Ido Dagan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1911.03243v2-abstract-short" style="display: inline;"> Question-answer driven Semantic Role Labeling (QA-SRL) was proposed as an attractive open and natural flavour of SRL, potentially attainable from laymen. Recently, a large-scale crowdsourced QA-SRL corpus and a trained parser were released. Trying to replicate the QA-SRL annotation for new texts, we found that the resulting annotations were lacking in quality, particularly in coverage, making them… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1911.03243v2-abstract-full').style.display = 'inline'; document.getElementById('1911.03243v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1911.03243v2-abstract-full" style="display: none;"> Question-answer driven Semantic Role Labeling (QA-SRL) was proposed as an attractive open and natural flavour of SRL, potentially attainable from laymen. Recently, a large-scale crowdsourced QA-SRL corpus and a trained parser were released. Trying to replicate the QA-SRL annotation for new texts, we found that the resulting annotations were lacking in quality, particularly in coverage, making them insufficient for further research and evaluation. In this paper, we present an improved crowdsourcing protocol for complex semantic annotation, involving worker selection and training, and a data consolidation phase. Applying this protocol to QA-SRL yielded high-quality annotation with drastically higher coverage, producing a new gold evaluation dataset. 
We believe that our annotation protocol and gold standard will facilitate future replicable research of natural semantic annotations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1911.03243v2-abstract-full').style.display = 'none'; document.getElementById('1911.03243v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 May, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 November, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2019. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1905.00537">arXiv:1905.00537</a> <span> [<a href="https://arxiv.org/pdf/1905.00537">pdf</a>, <a href="https://arxiv.org/format/1905.00537">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> SuperGLUE: A Stickier Benchmark for General-Purpose Language Understanding Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+A">Alex Wang</a>, <a href="/search/cs?searchtype=author&query=Pruksachatkun%2C+Y">Yada Pruksachatkun</a>, <a href="/search/cs?searchtype=author&query=Nangia%2C+N">Nikita Nangia</a>, <a href="/search/cs?searchtype=author&query=Singh%2C+A">Amanpreet Singh</a>, <a href="/search/cs?searchtype=author&query=Michael%2C+J">Julian Michael</a>, <a href="/search/cs?searchtype=author&query=Hill%2C+F">Felix Hill</a>, <a href="/search/cs?searchtype=author&query=Levy%2C+O">Omer Levy</a>, <a href="/search/cs?searchtype=author&query=Bowman%2C+S+R">Samuel R. Bowman</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1905.00537v3-abstract-short" style="display: inline;"> In the last year, new models and methods for pretraining and transfer learning have driven striking performance improvements across a range of language understanding tasks. The GLUE benchmark, introduced a little over one year ago, offers a single-number metric that summarizes progress on a diverse set of such tasks, but performance on the benchmark has recently surpassed the level of non-expert h… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1905.00537v3-abstract-full').style.display = 'inline'; document.getElementById('1905.00537v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1905.00537v3-abstract-full" style="display: none;"> In the last year, new models and methods for pretraining and transfer learning have driven striking performance improvements across a range of language understanding tasks. The GLUE benchmark, introduced a little over one year ago, offers a single-number metric that summarizes progress on a diverse set of such tasks, but performance on the benchmark has recently surpassed the level of non-expert humans, suggesting limited headroom for further research. 
In this paper we present SuperGLUE, a new benchmark styled after GLUE with a new set of more difficult language understanding tasks, a software toolkit, and a public leaderboard. SuperGLUE is available at super.gluebenchmark.com. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1905.00537v3-abstract-full').style.display = 'none'; document.getElementById('1905.00537v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 May, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2019. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2019, super.gluebenchmark.com updating acknowledgements</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1903.07377">arXiv:1903.07377</a> <span> [<a href="https://arxiv.org/pdf/1903.07377">pdf</a>, <a href="https://arxiv.org/format/1903.07377">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Evaluating Sequence-to-Sequence Models for Handwritten Text Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Michael%2C+J">Johannes Michael</a>, <a href="/search/cs?searchtype=author&query=Labahn%2C+R">Roger Labahn</a>, <a href="/search/cs?searchtype=author&query=Gr%C3%BCning%2C+T">Tobias Grüning</a>, <a href="/search/cs?searchtype=author&query=Z%C3%B6llner%2C+J">Jochen Zöllner</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1903.07377v2-abstract-short" style="display: inline;"> Encoder-decoder models have become an effective approach for sequence learning tasks like machine translation, image captioning and speech recognition, but have yet to show competitive results for handwritten text recognition. To this end, we propose an attention-based sequence-to-sequence model. It combines a convolutional neural network as a generic feature extractor with a recurrent neural netw… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1903.07377v2-abstract-full').style.display = 'inline'; document.getElementById('1903.07377v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1903.07377v2-abstract-full" style="display: none;"> Encoder-decoder models have become an effective approach for sequence learning tasks like machine translation, image captioning and speech recognition, but have yet to show competitive results for handwritten text recognition. To this end, we propose an attention-based sequence-to-sequence model.
It combines a convolutional neural network as a generic feature extractor with a recurrent neural network to encode both the visual information and the temporal context between characters in the input image, and uses a separate recurrent neural network to decode the actual character sequence. We make experimental comparisons between various attention mechanisms and positional encodings, in order to find an appropriate alignment between the input and output sequence. The model can be trained end-to-end and the optional integration of a hybrid loss allows the encoder to retain an interpretable and usable output, if desired. We achieve competitive results on the IAM and ICFHR2016 READ data sets compared to the state-of-the-art without the use of a language model, and we significantly improve over any recent sequence-to-sequence approaches. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1903.07377v2-abstract-full').style.display = 'none'; document.getElementById('1903.07377v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 July, 2019; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 March, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2019. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 1 figure, 8 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1807.06270">arXiv:1807.06270</a> <span> [<a href="https://arxiv.org/pdf/1807.06270">pdf</a>, <a href="https://arxiv.org/format/1807.06270">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Bench-Marking Information Extraction in Semi-Structured Historical Handwritten Records </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Prasad%2C+A">Animesh Prasad</a>, <a href="/search/cs?searchtype=author&query=D%C3%A9jean%2C+H">Hervé Déjean</a>, <a href="/search/cs?searchtype=author&query=Meunier%2C+J">Jean-Luc Meunier</a>, <a href="/search/cs?searchtype=author&query=Weidemann%2C+M">Max Weidemann</a>, <a href="/search/cs?searchtype=author&query=Michael%2C+J">Johannes Michael</a>, <a href="/search/cs?searchtype=author&query=Leifert%2C+G">Gundram Leifert</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1807.06270v1-abstract-short" style="display: inline;"> In this report, we present our findings from benchmarking experiments for information extraction on historical handwritten marriage records Esposalles from IEHHR - ICDAR 2017 robust reading competition. The information extraction is modeled as semantic labeling of the sequence across 2 sets of labels.
arXiv:1807.06270 (cs.CV, cs.CL) https://arxiv.org/abs/1807.06270
Title: Bench-Marking Information Extraction in Semi-Structured Historical Handwritten Records
Authors: Animesh Prasad, Hervé Déjean, Jean-Luc Meunier, Max Weidemann, Johannes Michael, Gundram Leifert
Abstract: In this report, we present our findings from benchmarking experiments for information extraction on the historical handwritten marriage records (Esposalles) from the IEHHR - ICDAR 2017 robust reading competition. The information extraction is modeled as semantic labeling of the sequence across two sets of labels. This can be achieved by applying handwritten text recognition (HTR) and named entity recognition (NER) sequentially or jointly. We deploy a pipeline approach in which we first use state-of-the-art HTR and then use its output as input for NER. We show that, given the low-resource setup and the simple structure of the records, high HTR performance ensures high overall performance. We explore various configurations of conditional random fields and neural networks to benchmark NER on the resulting noisy input. The best model on 10-fold cross-validation as well as on blind test data uses n-gram features with a bidirectional long short-term memory network.
Submitted 17 July, 2018; originally announced July 2018.
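As a toy illustration of the kind of n-gram features mentioned for the NER stage over noisy HTR output, the snippet below builds simple character n-gram and context features per token. The example line, the feature choices and the helper names are invented for illustration and are not taken from the Esposalles data or the authors' system.

```python
# Toy token-level features for an NER tagger over noisy HTR output.
# The example line and the feature set are invented for illustration.
def char_ngrams(token, n=3):
    padded = f"^{token.lower()}$"
    return sorted(padded[i:i + n] for i in range(max(len(padded) - n + 1, 1)))

def token_features(tokens, i):
    tok = tokens[i]
    return {
        "ngrams": char_ngrams(tok),
        "is_capitalised": tok[:1].isupper(),
        "prev": tokens[i - 1].lower() if i > 0 else "<bos>",
        "next": tokens[i + 1].lower() if i + 1 < len(tokens) else "<eos>",
    }

line = "example transcript of one record line".split()
features = [token_features(line, i) for i in range(len(line))]
print(features[0])
```

Feature dictionaries like these could feed a CRF directly, while the BiLSTM variant reported as best in the abstract would typically consume embedded n-grams instead.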
arXiv:1805.05377 (cs.CL, cs.AI) https://arxiv.org/abs/1805.05377
Title: Large-Scale QA-SRL Parsing
Authors: Nicholas FitzGerald, Julian Michael, Luheng He, Luke Zettlemoyer
Abstract: We present a new large-scale corpus of Question-Answer driven Semantic Role Labeling (QA-SRL) annotations, and the first high-quality QA-SRL parser. Our corpus, QA-SRL Bank 2.0, consists of over 250,000 question-answer pairs for over 64,000 sentences across 3 domains and was gathered with a new crowd-sourcing scheme that we show has high precision and good recall at modest cost. We also present neural models for two QA-SRL subtasks: detecting argument spans for a predicate and generating questions to label the semantic relationship. The best models achieve question accuracy of 82.6% and span-level accuracy of 77.6% (under human evaluation) on the full pipelined QA-SRL prediction task. They can also, as we show, be used to gather additional annotations at low cost.
Submitted 14 May, 2018; originally announced May 2018.
Comments: 10 pages, 3 figures, 8 tables. Accepted to ACL 2018
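To make the task concrete: a QA-SRL-style annotation pairs each predicate with wh-questions whose answers are spans of the sentence. The sentence, questions and spans below are invented examples, not drawn from QA-SRL Bank 2.0.

```python
# Invented example of a QA-SRL-style annotation: each verb gets
# wh-questions whose answers are token spans of the original sentence.
sentence = "The committee approved the proposal on Friday".split()

qa_srl = {
    "approved": [
        {"question": "Who approved something?",      "answer_span": (0, 2)},  # "The committee"
        {"question": "What was approved?",            "answer_span": (3, 5)},  # "the proposal"
        {"question": "When was something approved?",  "answer_span": (5, 7)},  # "on Friday"
    ]
}

for pred, qas in qa_srl.items():
    for qa in qas:
        s, e = qa["answer_span"]
        print(pred, "|", qa["question"], "->", " ".join(sentence[s:e]))
```

The two subtasks named in the abstract then correspond to predicting such answer spans for a predicate and generating the accompanying questions.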
arXiv:1804.09943 (cs.IR) https://arxiv.org/abs/1804.09943
Title: System Description of CITlab's Recognition & Retrieval Engine for ICDAR2017 Competition on Information Extraction in Historical Handwritten Records
Authors: Tobias Strauß, Max Weidemann, Johannes Michael, Gundram Leifert, Tobias Grüning, Roger Labahn
Abstract: We present a recognition and retrieval system for the ICDAR2017 Competition on Information Extraction in Historical Handwritten Records which successfully infers person names and other data from marriage records. The system extracts information from the line images with high accuracy and outperforms the baseline. The optical model is based on neural networks. To infer the desired information, regular expressions are used to describe the set of feasible word sequences.
Submitted 26 April, 2018; originally announced April 2018.
MSC Class: 68T10
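A toy illustration of using regular expressions to describe feasible word sequences: only recognizer hypotheses that match the pattern are kept. The pattern, the hypothesis strings and the pick-the-first-match strategy are assumptions made for this sketch; the described engine presumably applies such expressions to the optical model's hypothesis space rather than to a handful of hand-written strings.

```python
import re

# Hypothetical pattern for one field of a record: a capitalised given name,
# a capitalised surname, then an occupation from a small closed list.
pattern = re.compile(r"^[A-Z][a-z]+ [A-Z][a-z]+, (farmer|weaver|carpenter)$")

# Ranked hypotheses as a text recognizer might emit them (invented strings).
hypotheses = ["Joan Ferrer, farmer", "Joan Ferrer farmer", "joan ferrer, farmer"]

best = next((h for h in hypotheses if pattern.match(h)), None)
print(best)  # the first hypothesis that is a feasible word sequence
```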
arXiv:1804.07461 (cs.CL) https://arxiv.org/abs/1804.07461
Title: GLUE: A Multi-Task Benchmark and Analysis Platform for Natural Language Understanding
Authors: Alex Wang, Amanpreet Singh, Julian Michael, Felix Hill, Omer Levy, Samuel R. Bowman
Abstract: For natural language understanding (NLU) technology to be maximally useful, both practically and as a scientific object of study, it must be general: it must be able to process language in a way that is not exclusively tailored to any one specific task or dataset. In pursuit of this objective, we introduce the General Language Understanding Evaluation benchmark (GLUE), a tool for evaluating and analyzing the performance of models across a diverse range of existing NLU tasks. GLUE is model-agnostic, but it incentivizes sharing knowledge across tasks because certain tasks have very limited training data. We further provide a hand-crafted diagnostic test suite that enables detailed linguistic analysis of NLU models. We evaluate baselines based on current methods for multi-task and transfer learning and find that they do not immediately give substantial improvements over the aggregate performance of training a separate model per task, indicating room for improvement in developing general and robust NLU systems.
Submitted 22 February, 2019; v1 submitted 20 April, 2018; originally announced April 2018.
Comments: ICLR 2019; https://gluebenchmark.com/
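The overall benchmark score reported for GLUE is essentially a macro-average over per-task metrics. The sketch below uses invented per-task numbers, and the within-task averaging of the tasks that report two metrics is an assumption about the exact bookkeeping rather than a statement of the official scoring code.

```python
# Invented per-task scores; tasks with two official metrics are averaged
# within the task before the overall (macro) average is taken.
task_scores = {
    "CoLA": 45.0, "SST-2": 92.0, "MRPC": (86.0 + 81.0) / 2, "STS-B": (85.0 + 84.0) / 2,
    "QQP": (70.0 + 88.0) / 2, "MNLI": 84.0, "QNLI": 90.0, "RTE": 68.0, "WNLI": 65.0,
}
overall = sum(task_scores.values()) / len(task_scores)
print(f"overall score: {overall:.1f}")
```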
arXiv:1802.03345 (cs.CV) https://arxiv.org/abs/1802.03345
DOI: 10.1007/s10032-019-00332-1
Title: A Two-Stage Method for Text Line Detection in Historical Documents
Authors: Tobias Grüning, Gundram Leifert, Tobias Strauß, Johannes Michael, Roger Labahn
Abstract: This work presents a two-stage text line detection method for historical documents. Each detected text line is represented by its baseline. In a first stage, a deep neural network called ARU-Net labels pixels to belong to one of the three classes: baseline, separator or other. The separator class marks beginning and end of each text line. The ARU-Net is trainable from scratch with manageably few manually annotated example images (less than 50). This is achieved by utilizing data augmentation strategies. The network predictions are used as input for the second stage, which performs a bottom-up clustering to build baselines. The developed method is capable of handling complex layouts as well as curved and arbitrarily oriented text lines. It substantially outperforms current state-of-the-art approaches. For example, for the complex track of the cBAD: ICDAR2017 Competition on Baseline Detection the F-value is increased from 0.859 to 0.922. The framework to train and run the ARU-Net is open source.
Submitted 11 July, 2019; v1 submitted 9 February, 2018; originally announced February 2018.
Comments: to be published in IJDAR
Journal ref: International Journal on Document Analysis and Recognition (IJDAR), (2019), 1-18
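A crude stand-in for the second stage, only to show where the pixel-level predictions go: threshold the baseline class map and treat connected components as text-line candidates. The random probability map and the plain connected-component step are placeholders; the actual bottom-up clustering described in the paper is considerably more elaborate.

```python
import numpy as np
from scipy import ndimage

# Placeholder for the stage-one output: per-pixel probability of "baseline".
rng = np.random.default_rng(0)
prob_baseline = rng.random((128, 512))

# Stand-in for stage two: threshold and take connected components as
# text-line candidates (the real bottom-up clustering is more elaborate).
mask = prob_baseline > 0.99
labels, n_candidates = ndimage.label(mask)
print(f"{n_candidates} connected baseline candidates")
```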
arXiv:1711.05885 (cs.CL) https://arxiv.org/abs/1711.05885
Title: Crowdsourcing Question-Answer Meaning Representations
Authors: Julian Michael, Gabriel Stanovsky, Luheng He, Ido Dagan, Luke Zettlemoyer
Abstract: We introduce Question-Answer Meaning Representations (QAMRs), which represent the predicate-argument structure of a sentence as a set of question-answer pairs. We also develop a crowdsourcing scheme to show that QAMRs can be labeled with very little training, and gather a dataset with over 5,000 sentences and 100,000 questions. A detailed qualitative analysis demonstrates that the crowd-generated question-answer pairs cover the vast majority of predicate-argument relationships in existing datasets (including PropBank, NomBank, QA-SRL, and AMR) along with many previously under-resourced ones, including implicit arguments and relations. The QAMR data and annotation code are made publicly available to enable future work on how best to model these complex phenomena.
Submitted 15 November, 2017; originally announced November 2017.
Comments: 8 pages, 6 figures, 2 tables
arXiv:1608.01626 (cs.LO) https://arxiv.org/abs/1608.01626
Title: Proving Infinitary Formulas
Authors: Amelia Harrison, Vladimir Lifschitz, Julian Michael
Abstract: The infinitary propositional logic of here-and-there is important for the theory of answer set programming in view of its relation to strongly equivalent transformations of logic programs. We know a formal system axiomatizing this logic exists, but a proof in that system may include infinitely many formulas. In this note we describe a relationship between the validity of infinitary formulas in the logic of here-and-there and the provability of formulas in some finite deductive systems. This relationship allows us to use finite proofs to justify the validity of infinitary formulas. This note is under consideration for publication in Theory and Practice of Logic Programming.
Submitted 4 August, 2016; originally announced August 2016.
Comments: Paper presented at the 32nd International Conference on Logic Programming (ICLP 2016), New York City, USA, 16-21 October 2016, 15 pages, LaTeX, 3 PDF figures
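For the finite propositional fragment, validity in the logic of here-and-there can be checked by brute force over interpretations (H, T) with H a subset of T. The checker below uses the standard satisfaction clauses and is only meant to make the logic concrete; the note itself is about infinitary formulas, which such a checker does not cover.

```python
from itertools import combinations

# Brute-force validity in the finite propositional logic of here-and-there.
# Formulas: ("atom", p), ("bot",), ("and", a, b), ("or", a, b), ("impl", a, b);
# negation is the abbreviation a -> bottom.

def subsets(xs):
    return [set(c) for r in range(len(xs) + 1) for c in combinations(xs, r)]

def classical(T, f):
    tag = f[0]
    if tag == "atom": return f[1] in T
    if tag == "bot":  return False
    if tag == "and":  return classical(T, f[1]) and classical(T, f[2])
    if tag == "or":   return classical(T, f[1]) or classical(T, f[2])
    if tag == "impl": return (not classical(T, f[1])) or classical(T, f[2])

def ht(H, T, f):
    tag = f[0]
    if tag == "atom": return f[1] in H
    if tag == "bot":  return False
    if tag == "and":  return ht(H, T, f[1]) and ht(H, T, f[2])
    if tag == "or":   return ht(H, T, f[1]) or ht(H, T, f[2])
    if tag == "impl": return ((not ht(H, T, f[1])) or ht(H, T, f[2])) and classical(T, f)

def valid(f, atoms):
    return all(ht(H, T, f) for T in subsets(atoms) for H in subsets(T))

def neg(f):
    return ("impl", f, ("bot",))

p = ("atom", "p")
print(valid(("or", p, neg(p)), {"p"}))            # False: excluded middle fails
print(valid(("or", neg(p), neg(neg(p))), {"p"}))  # True: weak excluded middle holds
```

The two checks illustrate a familiar contrast with classical logic: excluded middle is not valid in here-and-there, while weak excluded middle is.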
arXiv:1302.4000 (q-bio.BM, cs.CE, q-bio.QM) https://arxiv.org/abs/1302.4000
DOI: 10.1186/1471-2105-14-62
Title: ClusCo: clustering and comparison of protein models
Authors: Jamróz Michał, Koliński Andrzej
Abstract: Background: The development, optimization and validation of protein modeling methods require efficient tools for structural comparison. Frequently, a large number of models need to be compared with the target native structure. The main reason for the development of the Clusco software was to create a high-throughput tool for all-versus-all comparison, because calculating the similarity matrix is one of the bottlenecks in the protein modeling pipeline. Results: Clusco is fast and easy-to-use software for high-throughput comparison of protein models with different similarity measures (cRMSD, dRMSD, GDT_TS, TM-Score, MaxSub, Contact Map Overlap) and for clustering of the comparison results with standard methods: K-means clustering or hierarchical agglomerative clustering. Conclusions: The application was highly optimized and written in C/C++, including code for parallel execution on the CPU and a GPU version of cRMSD, which resulted in a significant speedup over similar clustering and scoring programs.
Submitted 19 February, 2013; v1 submitted 16 February, 2013; originally announced February 2013.
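The all-versus-all comparison plus clustering workflow can be illustrated in a few lines, here with cRMSD after Kabsch superposition as the similarity measure and SciPy's hierarchical clustering standing in for the optimized C/C++/GPU implementation. The random coordinate sets are placeholders for real protein models.

```python
import numpy as np
from scipy.spatial.distance import squareform
from scipy.cluster.hierarchy import linkage, fcluster

def kabsch_rmsd(P, Q):
    """cRMSD between two (N, 3) coordinate sets after optimal superposition."""
    P = P - P.mean(axis=0)
    Q = Q - Q.mean(axis=0)
    U, _, Vt = np.linalg.svd(P.T @ Q)
    d = np.sign(np.linalg.det(U @ Vt))          # guard against reflections
    R = U @ np.diag([1.0, 1.0, d]) @ Vt
    return float(np.sqrt(np.mean(np.sum((P @ R - Q) ** 2, axis=1))))

rng = np.random.default_rng(1)
models = [rng.normal(size=(50, 3)) for _ in range(10)]  # toy "protein models"

# All-versus-all similarity matrix (the step ClusCo accelerates), then
# hierarchical agglomerative clustering of the models.
n = len(models)
dist = np.zeros((n, n))
for i in range(n):
    for j in range(i + 1, n):
        dist[i, j] = dist[j, i] = kabsch_rmsd(models[i], models[j])

Z = linkage(squareform(dist), method="average")
print(fcluster(Z, t=3, criterion="maxclust"))   # cluster label per model
```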
arXiv:1004.5512 (cs.CR) https://arxiv.org/abs/1004.5512
Title: Security Estimates for Quadratic Field Based Cryptosystems
Authors: Jean-François Biasse, Jacobson John Michael, Silverster K. Alan
Abstract: We describe implementations for solving the discrete logarithm problem in the class group of an imaginary quadratic field and in the infrastructure of a real quadratic field. The algorithms used incorporate improvements over previously-used algorithms, and extensive numerical results are presented demonstrating their efficiency. This data is used as the basis for extrapolations, used to provide recommendations for parameter sizes providing approximately the same level of security as block ciphers with 80-, 112-, 128-, 192-, and 256-bit symmetric keys.
Submitted 30 April, 2010; originally announced April 2010.
Journal ref: Lecture Notes in Computer Science (2010)