<!-- CINXE.COM -->
<!-- Search | arXiv e-print repository -->
<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/> <!-- new favicon config and versions by realfavicongenerator.net --> <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b"> <!-- end favicon config --> <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\\(","\\)"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script 
src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a> <!-- contains Cornell logo and sponsor statement --> <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. 
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" 
role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;39 of 39 results for author: <span class="mathjax">Aharoni, R</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Aharoni%2C+R">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Aharoni, R"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Aharoni%2C+R&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option 
value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Aharoni, R"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.15466">arXiv:2410.15466</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.15466">pdf</a>, <a href="https://arxiv.org/format/2410.15466">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Keep Guessing? 
When Considering Inference Scaling, Mind the Baselines </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yona%2C+G">Gal Yona</a>, <a href="/search/cs?searchtype=author&amp;query=Honovich%2C+O">Or Honovich</a>, <a href="/search/cs?searchtype=author&amp;query=Levy%2C+O">Omer Levy</a>, <a href="/search/cs?searchtype=author&amp;query=Aharoni%2C+R">Roee Aharoni</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.15466v1-abstract-short" style="display: inline;"> Scaling inference compute in large language models (LLMs) through repeated sampling consistently increases the coverage (fraction of problems solved) as the number of samples increases. We conjecture that this observed improvement is partially due to the answer distribution of standard evaluation benchmarks, which is skewed towards a relatively small set of common answers. To test this conjecture,&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15466v1-abstract-full').style.display = 'inline'; document.getElementById('2410.15466v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.15466v1-abstract-full" style="display: none;"> Scaling inference compute in large language models (LLMs) through repeated sampling consistently increases the coverage (fraction of problems solved) as the number of samples increases. We conjecture that this observed improvement is partially due to the answer distribution of standard evaluation benchmarks, which is skewed towards a relatively small set of common answers. To test this conjecture, we define a baseline that enumerates answers according to their prevalence in the training set. 
Experiments spanning two domains -- mathematical reasoning and factual knowledge -- reveal that this baseline outperforms repeated model sampling for some LLMs, while the coverage for others is on par with that of a mixture strategy that obtains $k$ answers by using only $10$ model samples and similarly guessing the remaining $k-10$ attempts via enumeration. Our baseline enables a more accurate measurement of how much repeated sampling improves coverage in such settings beyond prompt-agnostic guessing. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15466v1-abstract-full').style.display = 'none'; document.getElementById('2410.15466v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.07473">arXiv:2410.07473</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.07473">pdf</a>, <a href="https://arxiv.org/format/2410.07473">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Localizing Factual Inconsistencies in Attributable Text Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Cattan%2C+A">Arie Cattan</a>, <a href="/search/cs?searchtype=author&amp;query=Roit%2C+P">Paul Roit</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+S">Shiyue Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Wan%2C+D">David Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Aharoni%2C+R">Roee Aharoni</a>, <a 
href="/search/cs?searchtype=author&amp;query=Szpektor%2C+I">Idan Szpektor</a>, <a href="/search/cs?searchtype=author&amp;query=Bansal%2C+M">Mohit Bansal</a>, <a href="/search/cs?searchtype=author&amp;query=Dagan%2C+I">Ido Dagan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.07473v1-abstract-short" style="display: inline;"> There has been an increasing interest in detecting hallucinations in model-generated texts, both manually and automatically, at varying levels of granularity. However, most existing methods fail to precisely pinpoint the errors. In this work, we introduce QASemConsistency, a new formalism for localizing factual inconsistencies in attributable text generation, at a fine-grained level. Drawing inspi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.07473v1-abstract-full').style.display = 'inline'; document.getElementById('2410.07473v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.07473v1-abstract-full" style="display: none;"> There has been an increasing interest in detecting hallucinations in model-generated texts, both manually and automatically, at varying levels of granularity. However, most existing methods fail to precisely pinpoint the errors. In this work, we introduce QASemConsistency, a new formalism for localizing factual inconsistencies in attributable text generation, at a fine-grained level. Drawing inspiration from Neo-Davidsonian formal semantics, we propose decomposing the generated text into minimal predicate-argument level propositions, expressed as simple question-answer (QA) pairs, and assess whether each individual QA pair is supported by a trusted reference text. 
As each QA pair corresponds to a single semantic relation between a predicate and an argument, QASemConsistency effectively localizes the unsupported information. We first demonstrate the effectiveness of the QASemConsistency methodology for human annotation, by collecting crowdsourced annotations of granular consistency errors, while achieving a substantial inter-annotator agreement ($κ&gt; 0.7)$. Then, we implement several methods for automatically detecting localized factual inconsistencies, with both supervised entailment models and open-source LLMs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.07473v1-abstract-full').style.display = 'none'; document.getElementById('2410.07473v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.10646">arXiv:2408.10646</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.10646">pdf</a>, <a href="https://arxiv.org/format/2408.10646">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Beneath the Surface of Consistency: Exploring Cross-lingual Knowledge Representation Sharing in LLMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ifergan%2C+M">Maxim Ifergan</a>, <a href="/search/cs?searchtype=author&amp;query=Choshen%2C+L">Leshem Choshen</a>, <a href="/search/cs?searchtype=author&amp;query=Aharoni%2C+R">Roee Aharoni</a>, <a href="/search/cs?searchtype=author&amp;query=Szpektor%2C+I">Idan Szpektor</a>, <a href="/search/cs?searchtype=author&amp;query=Abend%2C+O">Omri Abend</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.10646v1-abstract-short" style="display: inline;"> The veracity of a factoid is largely independent of the language it is written in. However, language models are inconsistent in their ability to answer the same factual question across languages. This raises questions about how LLMs represent a given fact across languages. 
We explore multilingual factual knowledge through two aspects: the model&#39;s ability to answer a query consistently across langu&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.10646v1-abstract-full').style.display = 'inline'; document.getElementById('2408.10646v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.10646v1-abstract-full" style="display: none;"> The veracity of a factoid is largely independent of the language it is written in. However, language models are inconsistent in their ability to answer the same factual question across languages. This raises questions about how LLMs represent a given fact across languages. We explore multilingual factual knowledge through two aspects: the model&#39;s ability to answer a query consistently across languages, and the ability to &#39;&#39;store&#39;&#39; answers in a shared representation for several languages. We propose a methodology to measure the extent of representation sharing across languages by repurposing knowledge editing methods. We examine LLMs with various multilingual configurations using a new multilingual dataset. We reveal that high consistency does not necessarily imply shared representation, particularly for languages with different scripts. Moreover, we find that script similarity is a dominant factor in representation sharing. Finally, we observe that if LLMs could fully share knowledge across languages, their accuracy in their best-performing language could benefit an increase of up to 150\% on average. These findings highlight the need for improved multilingual knowledge representation in LLMs and suggest a path for the development of more robust and consistent multilingual LLMs. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.10646v1-abstract-full').style.display = 'none'; document.getElementById('2408.10646v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.13632">arXiv:2406.13632</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.13632">pdf</a>, <a href="https://arxiv.org/format/2406.13632">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Can Few-shot Work in Long-Context? Recycling the Context to Generate Demonstrations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Cattan%2C+A">Arie Cattan</a>, <a href="/search/cs?searchtype=author&amp;query=Jacovi%2C+A">Alon Jacovi</a>, <a href="/search/cs?searchtype=author&amp;query=Fabrikant%2C+A">Alex Fabrikant</a>, <a href="/search/cs?searchtype=author&amp;query=Herzig%2C+J">Jonathan Herzig</a>, <a href="/search/cs?searchtype=author&amp;query=Aharoni%2C+R">Roee Aharoni</a>, <a href="/search/cs?searchtype=author&amp;query=Rashkin%2C+H">Hannah Rashkin</a>, <a href="/search/cs?searchtype=author&amp;query=Marcus%2C+D">Dror Marcus</a>, <a href="/search/cs?searchtype=author&amp;query=Hassidim%2C+A">Avinatan Hassidim</a>, <a href="/search/cs?searchtype=author&amp;query=Matias%2C+Y">Yossi Matias</a>, <a href="/search/cs?searchtype=author&amp;query=Szpektor%2C+I">Idan Szpektor</a>, <a href="/search/cs?searchtype=author&amp;query=Caciularu%2C+A">Avi Caciularu</a> 
</p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.13632v3-abstract-short" style="display: inline;"> Despite recent advancements in Large Language Models (LLMs), their performance on tasks involving long contexts remains sub-optimal. In-Context Learning (ICL) with few-shot examples may be an appealing solution to enhance LLM performance in this scenario; However, naïvely adding ICL examples with long context introduces challenges, including substantial token overhead added for each few-shot examp&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.13632v3-abstract-full').style.display = 'inline'; document.getElementById('2406.13632v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.13632v3-abstract-full" style="display: none;"> Despite recent advancements in Large Language Models (LLMs), their performance on tasks involving long contexts remains sub-optimal. In-Context Learning (ICL) with few-shot examples may be an appealing solution to enhance LLM performance in this scenario; However, naïvely adding ICL examples with long context introduces challenges, including substantial token overhead added for each few-shot example and context mismatch between the demonstrations and the target query. In this work, we propose to automatically generate few-shot examples for long context QA tasks by recycling contexts. Specifically, given a long input context (1-3k tokens) and a query, we generate additional query-output pairs from the given context as few-shot examples, while introducing the context only once. This ensures that the demonstrations are leveraging the same context as the target query while only adding a small number of tokens to the prompt. 
We further enhance each demonstration by instructing the model to explicitly identify the relevant paragraphs before the answer, which improves performance while providing fine-grained attribution to the answer source. We apply our method on multiple LLMs and obtain substantial improvements (+16 absolute points on average across models) on various QA datasets with long context, especially when the answer lies within the middle of the context. Surprisingly, despite introducing only single-hop ICL examples, LLMs also successfully generalize to multi-hop long-context QA using our approach. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.13632v3-abstract-full').style.display = 'none'; document.getElementById('2406.13632v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.16908">arXiv:2405.16908</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2405.16908">pdf</a>, <a href="https://arxiv.org/format/2405.16908">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Can Large Language Models Faithfully Express Their Intrinsic Uncertainty in Words? 
</p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yona%2C+G">Gal Yona</a>, <a href="/search/cs?searchtype=author&amp;query=Aharoni%2C+R">Roee Aharoni</a>, <a href="/search/cs?searchtype=author&amp;query=Geva%2C+M">Mor Geva</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.16908v2-abstract-short" style="display: inline;"> We posit that large language models (LLMs) should be capable of expressing their intrinsic uncertainty in natural language. For example, if the LLM is equally likely to output two contradicting answers to the same question, then its generated response should reflect this uncertainty by hedging its answer (e.g., &#34;I&#39;m not sure, but I think...&#34;). We formalize faithful response uncertainty based on th&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.16908v2-abstract-full').style.display = 'inline'; document.getElementById('2405.16908v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.16908v2-abstract-full" style="display: none;"> We posit that large language models (LLMs) should be capable of expressing their intrinsic uncertainty in natural language. For example, if the LLM is equally likely to output two contradicting answers to the same question, then its generated response should reflect this uncertainty by hedging its answer (e.g., &#34;I&#39;m not sure, but I think...&#34;). We formalize faithful response uncertainty based on the gap between the model&#39;s intrinsic confidence in the assertions it makes and the decisiveness by which they are conveyed. This example-level metric reliably indicates whether the model reflects its uncertainty, as it penalizes both excessive and insufficient hedging. 
We evaluate a variety of aligned LLMs at faithfully communicating uncertainty on several knowledge-intensive question answering tasks. Our results provide strong evidence that modern LLMs are poor at faithfully conveying their uncertainty, and that better alignment is necessary to improve their trustworthiness. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.16908v2-abstract-full').style.display = 'none'; document.getElementById('2405.16908v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To appear in EMNLP 2024 (main conference)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.05904">arXiv:2405.05904</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2405.05904">pdf</a>, <a href="https://arxiv.org/format/2405.05904">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Does Fine-Tuning LLMs on New Knowledge Encourage Hallucinations? 
</p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gekhman%2C+Z">Zorik Gekhman</a>, <a href="/search/cs?searchtype=author&amp;query=Yona%2C+G">Gal Yona</a>, <a href="/search/cs?searchtype=author&amp;query=Aharoni%2C+R">Roee Aharoni</a>, <a href="/search/cs?searchtype=author&amp;query=Eyal%2C+M">Matan Eyal</a>, <a href="/search/cs?searchtype=author&amp;query=Feder%2C+A">Amir Feder</a>, <a href="/search/cs?searchtype=author&amp;query=Reichart%2C+R">Roi Reichart</a>, <a href="/search/cs?searchtype=author&amp;query=Herzig%2C+J">Jonathan Herzig</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.05904v3-abstract-short" style="display: inline;"> When large language models are aligned via supervised fine-tuning, they may encounter new factual information that was not acquired through pre-training. It is often conjectured that this can teach the model the behavior of hallucinating factually incorrect responses, as the model is trained to generate facts that are not grounded in its pre-existing knowledge. In this work, we study the impact of&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.05904v3-abstract-full').style.display = 'inline'; document.getElementById('2405.05904v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.05904v3-abstract-full" style="display: none;"> When large language models are aligned via supervised fine-tuning, they may encounter new factual information that was not acquired through pre-training. It is often conjectured that this can teach the model the behavior of hallucinating factually incorrect responses, as the model is trained to generate facts that are not grounded in its pre-existing knowledge. 
In this work, we study the impact of such exposure to new knowledge on the capability of the fine-tuned model to utilize its pre-existing knowledge. To this end, we design a controlled setup, focused on closed-book QA, where we vary the proportion of the fine-tuning examples that introduce new knowledge. We demonstrate that large language models struggle to acquire new factual knowledge through fine-tuning, as fine-tuning examples that introduce new knowledge are learned significantly slower than those consistent with the model&#39;s knowledge. However, we also find that as the examples with new knowledge are eventually learned, they linearly increase the model&#39;s tendency to hallucinate. Taken together, our results highlight the risk in introducing new factual knowledge through fine-tuning, and support the view that large language models mostly acquire factual knowledge through pre-training, whereas fine-tuning teaches them to use it more efficiently. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.05904v3-abstract-full').style.display = 'none'; document.getElementById('2405.05904v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted as a long paper at EMNLP 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.09631">arXiv:2402.09631</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2402.09631">pdf</a>, <a href="https://arxiv.org/format/2402.09631">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> </div> </div> <p class="title is-5 mathjax"> Representation Surgery: Theory and Practice of Affine Steering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Singh%2C+S">Shashwat Singh</a>, <a href="/search/cs?searchtype=author&amp;query=Ravfogel%2C+S">Shauli Ravfogel</a>, <a href="/search/cs?searchtype=author&amp;query=Herzig%2C+J">Jonathan Herzig</a>, <a href="/search/cs?searchtype=author&amp;query=Aharoni%2C+R">Roee Aharoni</a>, <a href="/search/cs?searchtype=author&amp;query=Cotterell%2C+R">Ryan Cotterell</a>, <a href="/search/cs?searchtype=author&amp;query=Kumaraguru%2C+P">Ponnurangam Kumaraguru</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.09631v6-abstract-short" style="display: inline;"> Language models often exhibit undesirable behavior, e.g., generating toxic or gender-biased text. In the case of neural language models, an encoding of the undesirable behavior is often present in the model&#39;s representations. 
Thus, one natural (and common) approach to prevent the model from exhibiting undesirable behavior is to steer the model&#39;s representations in a manner that reduces the probabi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.09631v6-abstract-full').style.display = 'inline'; document.getElementById('2402.09631v6-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.09631v6-abstract-full" style="display: none;"> Language models often exhibit undesirable behavior, e.g., generating toxic or gender-biased text. In the case of neural language models, an encoding of the undesirable behavior is often present in the model&#39;s representations. Thus, one natural (and common) approach to prevent the model from exhibiting undesirable behavior is to steer the model&#39;s representations in a manner that reduces the probability of it generating undesirable text. This paper investigates the formal and empirical properties of steering functions, i.e., transformation of the neural language model&#39;s representations that alter its behavior. First, we derive two optimal, in the least-squares sense, affine steering functions under different constraints. Our theory provides justification for existing approaches and offers a novel, improved steering approach. Second, we offer a series of experiments that demonstrate the empirical effectiveness of the methods in mitigating bias and reducing toxic generation. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.09631v6-abstract-full').style.display = 'none'; document.getElementById('2402.09631v6-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted in ICML 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.00559">arXiv:2402.00559</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2402.00559">pdf</a>, <a href="https://arxiv.org/format/2402.00559">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> A Chain-of-Thought Is as Strong as Its Weakest Link: A Benchmark for Verifiers of Reasoning Chains </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Jacovi%2C+A">Alon Jacovi</a>, <a href="/search/cs?searchtype=author&amp;query=Bitton%2C+Y">Yonatan Bitton</a>, <a href="/search/cs?searchtype=author&amp;query=Bohnet%2C+B">Bernd Bohnet</a>, <a href="/search/cs?searchtype=author&amp;query=Herzig%2C+J">Jonathan Herzig</a>, <a href="/search/cs?searchtype=author&amp;query=Honovich%2C+O">Or Honovich</a>, <a href="/search/cs?searchtype=author&amp;query=Tseng%2C+M">Michael Tseng</a>, <a href="/search/cs?searchtype=author&amp;query=Collins%2C+M">Michael Collins</a>, <a 
href="/search/cs?searchtype=author&amp;query=Aharoni%2C+R">Roee Aharoni</a>, <a href="/search/cs?searchtype=author&amp;query=Geva%2C+M">Mor Geva</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.00559v4-abstract-short" style="display: inline;"> Prompting language models to provide step-by-step answers (e.g., &#34;Chain-of-Thought&#34;) is the prominent approach for complex reasoning tasks, where more accurate reasoning chains typically improve downstream task performance. Recent literature discusses automatic methods to verify reasoning to evaluate and improve their correctness. However, no fine-grained step-level datasets are available to enabl&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.00559v4-abstract-full').style.display = 'inline'; document.getElementById('2402.00559v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.00559v4-abstract-full" style="display: none;"> Prompting language models to provide step-by-step answers (e.g., &#34;Chain-of-Thought&#34;) is the prominent approach for complex reasoning tasks, where more accurate reasoning chains typically improve downstream task performance. Recent literature discusses automatic methods to verify reasoning to evaluate and improve their correctness. However, no fine-grained step-level datasets are available to enable thorough evaluation of such verification methods, hindering progress in this direction. We introduce REVEAL: Reasoning Verification Evaluation, a dataset to benchmark automatic verifiers of complex Chain-of-Thought reasoning in open-domain question-answering settings. 
REVEAL includes comprehensive labels for the relevance, attribution to evidence passages, and logical correctness of each reasoning step in a language model&#39;s answer, across a variety of datasets and state-of-the-art language models. Evaluation on REVEAL shows that verifiers struggle at verifying reasoning chains - in particular, verifying logical correctness and detecting contradictions. Available at https://reveal-dataset.github.io/ . <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.00559v4-abstract-full').style.display = 'none'; document.getElementById('2402.00559v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to ACL 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.04695">arXiv:2401.04695</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2401.04695">pdf</a>, <a href="https://arxiv.org/format/2401.04695">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Narrowing the Knowledge Evaluation Gap: Open-Domain Question Answering with Multi-Granularity Answers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yona%2C+G">Gal Yona</a>, <a href="/search/cs?searchtype=author&amp;query=Aharoni%2C+R">Roee Aharoni</a>, 
<a href="/search/cs?searchtype=author&amp;query=Geva%2C+M">Mor Geva</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.04695v2-abstract-short" style="display: inline;"> Factual questions typically can be answered correctly at different levels of granularity. For example, both ``August 4, 1961&#39;&#39; and ``1961&#39;&#39; are correct answers to the question ``When was Barack Obama born?&#39;&#39;. Standard question answering (QA) evaluation protocols, however, do not explicitly take this into account and compare a predicted answer against answers of a single granularity level. In this&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.04695v2-abstract-full').style.display = 'inline'; document.getElementById('2401.04695v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.04695v2-abstract-full" style="display: none;"> Factual questions typically can be answered correctly at different levels of granularity. For example, both ``August 4, 1961&#39;&#39; and ``1961&#39;&#39; are correct answers to the question ``When was Barack Obama born?&#39;&#39;. Standard question answering (QA) evaluation protocols, however, do not explicitly take this into account and compare a predicted answer against answers of a single granularity level. In this work, we propose GRANOLA QA, a novel evaluation setting where a predicted answer is evaluated in terms of accuracy and informativeness against a set of multi-granularity answers. We present a simple methodology for enriching existing datasets with multi-granularity answers, and create GRANOLA-EQ, a multi-granularity version of the EntityQuestions dataset. 
We evaluate a range of decoding methods on GRANOLA-EQ, including a new algorithm, called Decoding with Response Aggregation (DRAG), that is geared towards aligning the response granularity with the model&#39;s uncertainty. Our experiments show that large language models with standard decoding tend to generate specific answers, which are often incorrect. In contrast, when evaluated on multi-granularity answers, DRAG yields a nearly 20 point increase in accuracy on average, which further increases for rare entities. Overall, this reveals that standard evaluation and decoding schemes may significantly underestimate the knowledge encapsulated in LMs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.04695v2-abstract-full').style.display = 'none'; document.getElementById('2401.04695v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To appear in ACL 2024 Main Conference</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.01854">arXiv:2401.01854</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2401.01854">pdf</a>, <a href="https://arxiv.org/format/2401.01854">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Multilingual Instruction Tuning With Just a Pinch of Multilinguality </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Shaham%2C+U">Uri Shaham</a>, <a href="/search/cs?searchtype=author&amp;query=Herzig%2C+J">Jonathan Herzig</a>, <a href="/search/cs?searchtype=author&amp;query=Aharoni%2C+R">Roee Aharoni</a>, <a href="/search/cs?searchtype=author&amp;query=Szpektor%2C+I">Idan Szpektor</a>, <a href="/search/cs?searchtype=author&amp;query=Tsarfaty%2C+R">Reut Tsarfaty</a>, <a href="/search/cs?searchtype=author&amp;query=Eyal%2C+M">Matan Eyal</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.01854v4-abstract-short" style="display: inline;"> As instruction-tuned large language models (LLMs) gain global adoption, their ability to follow instructions in multiple languages becomes increasingly crucial. 
In this work, we investigate how multilinguality during instruction tuning of a multilingual LLM affects instruction-following across languages from the pre-training corpus. We first show that many languages transfer some instruction-follo&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.01854v4-abstract-full').style.display = 'inline'; document.getElementById('2401.01854v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.01854v4-abstract-full" style="display: none;"> As instruction-tuned large language models (LLMs) gain global adoption, their ability to follow instructions in multiple languages becomes increasingly crucial. In this work, we investigate how multilinguality during instruction tuning of a multilingual LLM affects instruction-following across languages from the pre-training corpus. We first show that many languages transfer some instruction-following capabilities to other languages from even monolingual tuning. Furthermore, we find that only 40 multilingual examples integrated in an English tuning set substantially improve multilingual instruction-following, both in seen and unseen languages during tuning. In general, we observe that models tuned on multilingual mixtures exhibit comparable or superior performance in multiple languages compared to monolingually tuned models, despite training on 10x fewer examples in those languages. Finally, we find that diversifying the instruction tuning set with even just 2-4 languages significantly improves cross-lingual generalization. Our results suggest that building massively multilingual instruction-tuned models can be done with only a very small set of multilingual instruction-responses. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.01854v4-abstract-full').style.display = 'none'; document.getElementById('2401.01854v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Findings of ACL 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.17670">arXiv:2311.17670</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2311.17670">pdf</a>, <a href="https://arxiv.org/ps/2311.17670">ps</a>, <a href="https://arxiv.org/format/2311.17670">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Combinatorics">math.CO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Discrete Mathematics">cs.DM</span> </div> </div> <p class="title is-5 mathjax"> 2-covers of wide Young diagrams </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Aharoni%2C+R">Ron Aharoni</a>, <a href="/search/cs?searchtype=author&amp;query=Berger%2C+E">Eli Berger</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+H">He Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Kotlar%2C+D">Daniel Kotlar</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.17670v2-abstract-short" style="display: 
inline;"> A Young diagram $Y$ is called wide if every sub-diagram $Z$ formed by a subset of the rows of $Y$ dominates $Z&#39;$, the conjugate of $Z$. A Young diagram $Y$ is called Latin if its squares can be assigned numbers so that for each $i$, the $i$th row is filled injectively with the numbers $1, \ldots ,a_i$, where $a_i$ is the length of $i$th row of $Y$, and every column is also filled injectively. A co&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.17670v2-abstract-full').style.display = 'inline'; document.getElementById('2311.17670v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.17670v2-abstract-full" style="display: none;"> A Young diagram $Y$ is called wide if every sub-diagram $Z$ formed by a subset of the rows of $Y$ dominates $Z&#39;$, the conjugate of $Z$. A Young diagram $Y$ is called Latin if its squares can be assigned numbers so that for each $i$, the $i$th row is filled injectively with the numbers $1, \ldots ,a_i$, where $a_i$ is the length of $i$th row of $Y$, and every column is also filled injectively. A conjecture of Chow and Taylor, publicized by Chow, Fan, Goemans, and Vondrak is that a wide Young diagram is Latin. We prove a dual version of the conjecture. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.17670v2-abstract-full').style.display = 'none'; document.getElementById('2311.17670v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">17 pages; Added a few more questions and a reference</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 05A17; 05C65; 05C70; 05D15 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.10062">arXiv:2310.10062</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2310.10062">pdf</a>, <a href="https://arxiv.org/format/2310.10062">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> A Comprehensive Evaluation of Tool-Assisted Generation Strategies </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Jacovi%2C+A">Alon Jacovi</a>, <a href="/search/cs?searchtype=author&amp;query=Caciularu%2C+A">Avi Caciularu</a>, <a href="/search/cs?searchtype=author&amp;query=Herzig%2C+J">Jonathan Herzig</a>, <a href="/search/cs?searchtype=author&amp;query=Aharoni%2C+R">Roee Aharoni</a>, <a href="/search/cs?searchtype=author&amp;query=Bohnet%2C+B">Bernd Bohnet</a>, <a href="/search/cs?searchtype=author&amp;query=Geva%2C+M">Mor Geva</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.10062v2-abstract-short" style="display: inline;"> A growing area of research investigates augmenting language models with tools (e.g., search engines, calculators) to overcome their shortcomings (e.g., missing or incorrect knowledge, incorrect logical 
inferences). Various few-shot tool-usage strategies have been proposed. However, there is no systematic and fair comparison across different strategies, or between these strategies and strong baseli&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.10062v2-abstract-full').style.display = 'inline'; document.getElementById('2310.10062v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.10062v2-abstract-full" style="display: none;"> A growing area of research investigates augmenting language models with tools (e.g., search engines, calculators) to overcome their shortcomings (e.g., missing or incorrect knowledge, incorrect logical inferences). Various few-shot tool-usage strategies have been proposed. However, there is no systematic and fair comparison across different strategies, or between these strategies and strong baselines that do not leverage tools. We conduct an extensive empirical analysis, finding that (1) across various datasets, example difficulty levels, and models, strong no-tool baselines are competitive to tool-assisted strategies, implying that effectively using tools with in-context demonstrations is a difficult unsolved problem; (2) for knowledge-retrieval tasks, strategies that *refine* incorrect outputs with tools outperform strategies that retrieve relevant information *ahead of* or *during generation*; (3) tool-assisted strategies are expensive in the number of tokens they require to work -- incurring additional costs by orders of magnitude -- which does not translate into significant improvement in performance. Overall, our findings suggest that few-shot tool integration is still an open challenge, emphasizing the need for comprehensive evaluations of future strategies to accurately assess their *benefits* and *costs*. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.10062v2-abstract-full').style.display = 'none'; document.getElementById('2310.10062v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to EMNLP 2023 Findings</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2309.03735">arXiv:2309.03735</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2309.03735">pdf</a>, <a href="https://arxiv.org/ps/2309.03735">ps</a>, <a href="https://arxiv.org/format/2309.03735">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Combinatorics">math.CO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Discrete Mathematics">cs.DM</span> </div> </div> <p class="title is-5 mathjax"> Looms </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Aharoni%2C+R">Ron Aharoni</a>, <a href="/search/cs?searchtype=author&amp;query=Berger%2C+E">Eli Berger</a>, <a href="/search/cs?searchtype=author&amp;query=Briggs%2C+J">Joseph Briggs</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+H">He Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Zerbib%2C+S">Shira Zerbib</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short 
has-text-grey-dark mathjax" id="2309.03735v2-abstract-short" style="display: inline;"> A pair $(A,B)$ of hypergraphs is called orthogonal if $|a \cap b|=1$ for every pair of edges $a \in A$ and $b \in B$. An orthogonal pair of hypergraphs is called a loom if each of its two members is the set of minimum covers of the other. Looms appear naturally in the context of a conjecture of Gyárfás and Lehel on the covering number of cross-intersecting hypergraphs. We study their properties an&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.03735v2-abstract-full').style.display = 'inline'; document.getElementById('2309.03735v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2309.03735v2-abstract-full" style="display: none;"> A pair $(A,B)$ of hypergraphs is called orthogonal if $|a \cap b|=1$ for every pair of edges $a \in A$ and $b \in B$. An orthogonal pair of hypergraphs is called a loom if each of its two members is the set of minimum covers of the other. Looms appear naturally in the context of a conjecture of Gyárfás and Lehel on the covering number of cross-intersecting hypergraphs. We study their properties and ways of construction, and prove special cases of a conjecture that if true would imply the Gyárfás--Lehel conjecture. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.03735v2-abstract-full').style.display = 'none'; document.getElementById('2309.03735v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2023. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">20 pages; Minor revisions; Added a coauthor; To appear in Discrete Mathematics</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 05C65; 05C35; 05C72; 05C76; 05D15 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.00186">arXiv:2306.00186</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2306.00186">pdf</a>, <a href="https://arxiv.org/format/2306.00186">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Factually Consistent Summarization via Reinforcement Learning with Textual Entailment Feedback </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Roit%2C+P">Paul Roit</a>, <a href="/search/cs?searchtype=author&amp;query=Ferret%2C+J">Johan Ferret</a>, <a href="/search/cs?searchtype=author&amp;query=Shani%2C+L">Lior Shani</a>, <a href="/search/cs?searchtype=author&amp;query=Aharoni%2C+R">Roee Aharoni</a>, <a href="/search/cs?searchtype=author&amp;query=Cideron%2C+G">Geoffrey Cideron</a>, <a href="/search/cs?searchtype=author&amp;query=Dadashi%2C+R">Robert Dadashi</a>, <a href="/search/cs?searchtype=author&amp;query=Geist%2C+M">Matthieu Geist</a>, <a href="/search/cs?searchtype=author&amp;query=Girgin%2C+S">Sertan Girgin</a>, <a href="/search/cs?searchtype=author&amp;query=Hussenot%2C+L">Léonard Hussenot</a>, <a href="/search/cs?searchtype=author&amp;query=Keller%2C+O">Orgad Keller</a>, <a href="/search/cs?searchtype=author&amp;query=Momchev%2C+N">Nikola Momchev</a>, <a href="/search/cs?searchtype=author&amp;query=Ramos%2C+S">Sabela 
Ramos</a>, <a href="/search/cs?searchtype=author&amp;query=Stanczyk%2C+P">Piotr Stanczyk</a>, <a href="/search/cs?searchtype=author&amp;query=Vieillard%2C+N">Nino Vieillard</a>, <a href="/search/cs?searchtype=author&amp;query=Bachem%2C+O">Olivier Bachem</a>, <a href="/search/cs?searchtype=author&amp;query=Elidan%2C+G">Gal Elidan</a>, <a href="/search/cs?searchtype=author&amp;query=Hassidim%2C+A">Avinatan Hassidim</a>, <a href="/search/cs?searchtype=author&amp;query=Pietquin%2C+O">Olivier Pietquin</a>, <a href="/search/cs?searchtype=author&amp;query=Szpektor%2C+I">Idan Szpektor</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.00186v1-abstract-short" style="display: inline;"> Despite the seeming success of contemporary grounded text generation systems, they often tend to generate factually inconsistent text with respect to their input. This phenomenon is emphasized in tasks like summarization, in which the generated summaries should be corroborated by their source article. In this work, we leverage recent progress on textual entailment models to directly address this p&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.00186v1-abstract-full').style.display = 'inline'; document.getElementById('2306.00186v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.00186v1-abstract-full" style="display: none;"> Despite the seeming success of contemporary grounded text generation systems, they often tend to generate factually inconsistent text with respect to their input. This phenomenon is emphasized in tasks like summarization, in which the generated summaries should be corroborated by their source article. 
In this work, we leverage recent progress on textual entailment models to directly address this problem for abstractive summarization systems. We use reinforcement learning with reference-free, textual entailment rewards to optimize for factual consistency and explore the ensuing trade-offs, as improved consistency may come at the cost of less informative or more extractive summaries. Our results, according to both automatic metrics and human evaluation, show that our method considerably improves the faithfulness, salience, and conciseness of the generated summaries. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.00186v1-abstract-full').style.display = 'none'; document.getElementById('2306.00186v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ACL 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.14332">arXiv:2305.14332</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2305.14332">pdf</a>, <a href="https://arxiv.org/format/2305.14332">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Evaluating and Modeling Attribution for Cross-Lingual Question Answering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Muller%2C+B">Benjamin Muller</a>, <a href="/search/cs?searchtype=author&amp;query=Wieting%2C+J">John Wieting</a>, <a href="/search/cs?searchtype=author&amp;query=Clark%2C+J+H">Jonathan H. Clark</a>, <a href="/search/cs?searchtype=author&amp;query=Kwiatkowski%2C+T">Tom Kwiatkowski</a>, <a href="/search/cs?searchtype=author&amp;query=Ruder%2C+S">Sebastian Ruder</a>, <a href="/search/cs?searchtype=author&amp;query=Soares%2C+L+B">Livio Baldini Soares</a>, <a href="/search/cs?searchtype=author&amp;query=Aharoni%2C+R">Roee Aharoni</a>, <a href="/search/cs?searchtype=author&amp;query=Herzig%2C+J">Jonathan Herzig</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xinyi Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.14332v2-abstract-short" style="display: inline;"> Trustworthy answer content is abundant in many high-resource languages and is instantly accessible through question answering systems, yet this content can be hard to access for those that do not speak these languages. 
The leap forward in cross-lingual modeling quality offered by generative language models offers much promise, yet their raw generations often fall short in factuality. To improve tr&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.14332v2-abstract-full').style.display = 'inline'; document.getElementById('2305.14332v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.14332v2-abstract-full" style="display: none;"> Trustworthy answer content is abundant in many high-resource languages and is instantly accessible through question answering systems, yet this content can be hard to access for those that do not speak these languages. The leap forward in cross-lingual modeling quality offered by generative language models offers much promise, yet their raw generations often fall short in factuality. To improve trustworthiness in these systems, a promising direction is to attribute the answer to a retrieved source, possibly in a content-rich language different from the query. Our work is the first to study attribution for cross-lingual question answering. First, we collect data in 5 languages to assess the attribution level of a state-of-the-art cross-lingual QA system. To our surprise, we find that a substantial portion of the answers is not attributable to any retrieved passages (up to 50% of answers exactly matching a gold reference) despite the system being able to attend directly to the retrieved text. Second, to address this poor attribution level, we experiment with a wide range of attribution detection techniques. We find that Natural Language Inference models and PaLM 2 fine-tuned on a very small amount of attribution data can accurately detect attribution. Based on these models, we improve the attribution level of a cross-lingual question-answering system. 
Overall, we show that current academic generative cross-lingual QA systems have substantial shortcomings in attribution and we build tooling to mitigate these issues. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.14332v2-abstract-full').style.display = 'none'; document.getElementById('2305.14332v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published as a long paper at EMNLP 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.13194">arXiv:2305.13194</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2305.13194">pdf</a>, <a href="https://arxiv.org/format/2305.13194">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> SEAHORSE: A Multilingual, Multifaceted Dataset for Summarization Evaluation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Clark%2C+E">Elizabeth Clark</a>, <a href="/search/cs?searchtype=author&amp;query=Rijhwani%2C+S">Shruti Rijhwani</a>, <a href="/search/cs?searchtype=author&amp;query=Gehrmann%2C+S">Sebastian Gehrmann</a>, <a href="/search/cs?searchtype=author&amp;query=Maynez%2C+J">Joshua Maynez</a>, <a href="/search/cs?searchtype=author&amp;query=Aharoni%2C+R">Roee Aharoni</a>, <a 
href="/search/cs?searchtype=author&amp;query=Nikolaev%2C+V">Vitaly Nikolaev</a>, <a href="/search/cs?searchtype=author&amp;query=Sellam%2C+T">Thibault Sellam</a>, <a href="/search/cs?searchtype=author&amp;query=Siddhant%2C+A">Aditya Siddhant</a>, <a href="/search/cs?searchtype=author&amp;query=Das%2C+D">Dipanjan Das</a>, <a href="/search/cs?searchtype=author&amp;query=Parikh%2C+A+P">Ankur P. Parikh</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.13194v2-abstract-short" style="display: inline;"> Reliable automatic evaluation of summarization systems is challenging due to the multifaceted and subjective nature of the task. This is especially the case for languages other than English, where human evaluations are scarce. In this work, we introduce SEAHORSE, a dataset for multilingual, multifaceted summarization evaluation. SEAHORSE consists of 96K summaries with human ratings along 6 dimensi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.13194v2-abstract-full').style.display = 'inline'; document.getElementById('2305.13194v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.13194v2-abstract-full" style="display: none;"> Reliable automatic evaluation of summarization systems is challenging due to the multifaceted and subjective nature of the task. This is especially the case for languages other than English, where human evaluations are scarce. In this work, we introduce SEAHORSE, a dataset for multilingual, multifaceted summarization evaluation. SEAHORSE consists of 96K summaries with human ratings along 6 dimensions of text quality: comprehensibility, repetition, grammar, attribution, main ideas, and conciseness, covering 6 languages, 9 systems and 4 datasets. 
As a result of its size and scope, SEAHORSE can serve both as a benchmark to evaluate learnt metrics, as well as a large-scale resource for training such metrics. We show that metrics trained with SEAHORSE achieve strong performance on the out-of-domain meta-evaluation benchmarks TRUE (Honovich et al., 2022) and mFACE (Aharoni et al., 2022). We make the SEAHORSE dataset and metrics publicly available for future research on multilingual and multifaceted summarization evaluation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.13194v2-abstract-full').style.display = 'none'; document.getElementById('2305.13194v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 22 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.11171">arXiv:2305.11171</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2305.11171">pdf</a>, <a href="https://arxiv.org/format/2305.11171">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> TrueTeacher: Learning Factual Consistency Evaluation with Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gekhman%2C+Z">Zorik Gekhman</a>, <a href="/search/cs?searchtype=author&amp;query=Herzig%2C+J">Jonathan Herzig</a>, <a href="/search/cs?searchtype=author&amp;query=Aharoni%2C+R">Roee Aharoni</a>, <a href="/search/cs?searchtype=author&amp;query=Elkind%2C+C">Chen Elkind</a>, <a href="/search/cs?searchtype=author&amp;query=Szpektor%2C+I">Idan Szpektor</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.11171v3-abstract-short" style="display: inline;"> Factual consistency evaluation is often conducted using Natural Language Inference (NLI) models, yet these models exhibit limited success in evaluating summaries. Previous work improved such models with synthetic training data. 
However, the data is typically based on perturbed human-written summaries, which often differ in their characteristics from real model-generated summaries and have limited&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.11171v3-abstract-full').style.display = 'inline'; document.getElementById('2305.11171v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.11171v3-abstract-full" style="display: none;"> Factual consistency evaluation is often conducted using Natural Language Inference (NLI) models, yet these models exhibit limited success in evaluating summaries. Previous work improved such models with synthetic training data. However, the data is typically based on perturbed human-written summaries, which often differ in their characteristics from real model-generated summaries and have limited coverage of possible factual errors. Alternatively, large language models (LLMs) have recently shown promising results in directly evaluating generative tasks, but are too computationally expensive for practical use. Motivated by these limitations, we introduce TrueTeacher, a method for generating synthetic data by annotating diverse model-generated summaries using a LLM. Unlike prior work, TrueTeacher does not rely on human-written summaries, and is multilingual by nature. Experiments on the TRUE benchmark show that a student model trained using our data, substantially outperforms both the state-of-the-art model with similar capacity, and the LLM teacher. In a systematic study, we compare TrueTeacher to existing synthetic data generation methods and demonstrate its superiority and robustness to domain-shift. We also show that our method generalizes to multilingual scenarios. Lastly, we release our large scale synthetic dataset (1.4M examples), generated using TrueTeacher, and a checkpoint trained on this data. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.11171v3-abstract-full').style.display = 'none'; document.getElementById('2305.11171v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted as a long paper in EMNLP 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.10400">arXiv:2305.10400</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2305.10400">pdf</a>, <a href="https://arxiv.org/format/2305.10400">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> What You See is What You Read? 
Improving Text-Image Alignment Evaluation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yarom%2C+M">Michal Yarom</a>, <a href="/search/cs?searchtype=author&amp;query=Bitton%2C+Y">Yonatan Bitton</a>, <a href="/search/cs?searchtype=author&amp;query=Changpinyo%2C+S">Soravit Changpinyo</a>, <a href="/search/cs?searchtype=author&amp;query=Aharoni%2C+R">Roee Aharoni</a>, <a href="/search/cs?searchtype=author&amp;query=Herzig%2C+J">Jonathan Herzig</a>, <a href="/search/cs?searchtype=author&amp;query=Lang%2C+O">Oran Lang</a>, <a href="/search/cs?searchtype=author&amp;query=Ofek%2C+E">Eran Ofek</a>, <a href="/search/cs?searchtype=author&amp;query=Szpektor%2C+I">Idan Szpektor</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.10400v4-abstract-short" style="display: inline;"> Automatically determining whether a text and a corresponding image are semantically aligned is a significant challenge for vision-language models, with applications in generative text-to-image and image-to-text tasks. In this work, we study methods for automatic text-image alignment evaluation. We first introduce SeeTRUE: a comprehensive evaluation set, spanning multiple datasets from both text-to&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.10400v4-abstract-full').style.display = 'inline'; document.getElementById('2305.10400v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.10400v4-abstract-full" style="display: none;"> Automatically determining whether a text and a corresponding image are semantically aligned is a significant challenge for vision-language models, with applications in generative text-to-image and image-to-text tasks. 
In this work, we study methods for automatic text-image alignment evaluation. We first introduce SeeTRUE: a comprehensive evaluation set, spanning multiple datasets from both text-to-image and image-to-text generation tasks, with human judgements for whether a given text-image pair is semantically aligned. We then describe two automatic methods to determine alignment: the first involving a pipeline based on question generation and visual question answering models, and the second employing an end-to-end classification approach by finetuning multimodal pretrained models. Both methods surpass prior approaches in various text-image alignment tasks, with significant improvements in challenging cases that involve complex composition or unnatural images. Finally, we demonstrate how our approaches can localize specific misalignments between an image and a given text, and how they can be used to automatically re-rank candidates in text-to-image generation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.10400v4-abstract-full').style.display = 'none'; document.getElementById('2305.10400v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to NeurIPS 2023. 
Website: https://wysiwyr-itm.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.07378">arXiv:2305.07378</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2305.07378">pdf</a>, <a href="https://arxiv.org/format/2305.07378">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Surfacing Biases in Large Language Models using Contrastive Input Decoding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yona%2C+G">Gal Yona</a>, <a href="/search/cs?searchtype=author&amp;query=Honovich%2C+O">Or Honovich</a>, <a href="/search/cs?searchtype=author&amp;query=Laish%2C+I">Itay Laish</a>, <a href="/search/cs?searchtype=author&amp;query=Aharoni%2C+R">Roee Aharoni</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.07378v1-abstract-short" style="display: inline;"> Ensuring that large language models (LMs) are fair, robust and useful requires an understanding of how different modifications to their inputs impact the model&#39;s behaviour. In the context of open-text generation tasks, however, such an evaluation is not trivial. 
For example, when introducing a model with an input text and a perturbed, &#34;contrastive&#34; version of it, meaningful differences in the next&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.07378v1-abstract-full').style.display = 'inline'; document.getElementById('2305.07378v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.07378v1-abstract-full" style="display: none;"> Ensuring that large language models (LMs) are fair, robust and useful requires an understanding of how different modifications to their inputs impact the model&#39;s behaviour. In the context of open-text generation tasks, however, such an evaluation is not trivial. For example, when introducing a model with an input text and a perturbed, &#34;contrastive&#34; version of it, meaningful differences in the next-token predictions may not be revealed with standard decoding strategies. With this motivation in mind, we propose Contrastive Input Decoding (CID): a decoding algorithm to generate text given two inputs, where the generated text is likely given one input but unlikely given the other. In this way, the contrastive generations can highlight potentially subtle differences in how the LM output differs for the two inputs in a simple and interpretable manner. We use CID to highlight context-specific biases that are hard to detect with standard decoding strategies and quantify the effect of different input perturbations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.07378v1-abstract-full').style.display = 'none'; document.getElementById('2305.07378v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2304.14318">arXiv:2304.14318</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2304.14318">pdf</a>, <a href="https://arxiv.org/format/2304.14318">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> q2d: Turning Questions into Dialogs to Teach Models How to Search </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Bitton%2C+Y">Yonatan Bitton</a>, <a href="/search/cs?searchtype=author&amp;query=Cohen-Ganor%2C+S">Shlomi Cohen-Ganor</a>, <a href="/search/cs?searchtype=author&amp;query=Hakimi%2C+I">Ido Hakimi</a>, <a href="/search/cs?searchtype=author&amp;query=Lewenberg%2C+Y">Yoad Lewenberg</a>, <a href="/search/cs?searchtype=author&amp;query=Aharoni%2C+R">Roee Aharoni</a>, <a href="/search/cs?searchtype=author&amp;query=Weinreb%2C+E">Enav Weinreb</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2304.14318v2-abstract-short" style="display: inline;"> One of the exciting capabilities of recent language models for dialog is their ability to independently search for relevant information to ground a given dialog response. However, obtaining training data to teach models how to issue search queries is time and resource consuming. 
In this work, we propose q2d: an automatic data generation pipeline that generates information-seeking dialogs from ques&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.14318v2-abstract-full').style.display = 'inline'; document.getElementById('2304.14318v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2304.14318v2-abstract-full" style="display: none;"> One of the exciting capabilities of recent language models for dialog is their ability to independently search for relevant information to ground a given dialog response. However, obtaining training data to teach models how to issue search queries is time and resource consuming. In this work, we propose q2d: an automatic data generation pipeline that generates information-seeking dialogs from questions. We prompt a large language model (PaLM) to create conversational versions of question answering datasets, and use it to improve query generation models that communicate with external search APIs to ground dialog responses. Unlike previous approaches which relied on human written dialogs with search queries, our method allows to automatically generate query-based grounded dialogs with better control and scale. Our experiments demonstrate that: (1) For query generation on the QReCC dataset, models trained on our synthetically-generated data achieve 90%--97% of the performance of models trained on the human-generated data; (2) We can successfully generate data for training dialog models in new domains without any existing dialog data as demonstrated on the multi-hop MuSiQue and Bamboogle QA datasets. (3) We perform a thorough analysis of the generated dialogs showing that humans find them of high quality and struggle to distinguish them from human-written dialogs. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.14318v2-abstract-full').style.display = 'none'; document.getElementById('2304.14318v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 April, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to EMNLP 2023. Website: https://question2dialog.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2301.10312">arXiv:2301.10312</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2301.10312">pdf</a>, <a href="https://arxiv.org/ps/2301.10312">ps</a>, <a href="https://arxiv.org/format/2301.10312">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Combinatorics">math.CO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Discrete Mathematics">cs.DM</span> </div> </div> <p class="title is-5 mathjax"> Tight infinite matrices </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Aharoni%2C+R">Ron Aharoni</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+H">He Guo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2301.10312v1-abstract-short" style="display: inline;"> We give a simple proof of a recent result of Gollin and Joó: if a possibly infinite system of homogeneous 
linear equations $A\vec{x} = \vec{0}$, where $A = (a_{i, j})$ is an $I \times J$ matrix, has only the trivial solution, then there exists an injection $\varphi: J \to I$, such that $a_{\varphi(j), j} \neq 0$ for all $j \in J$. </span> <span class="abstract-full has-text-grey-dark mathjax" id="2301.10312v1-abstract-full" style="display: none;"> We give a simple proof of a recent result of Gollin and Joó: if a possibly infinite system of homogeneous linear equations $A\vec{x} = \vec{0}$, where $A = (a_{i, j})$ is an $I \times J$ matrix, has only the trivial solution, then there exists an injection $\varphi: J \to I$, such that $a_{\varphi(j), j} \neq 0$ for all $j \in J$. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2301.10312v1-abstract-full').style.display = 'none'; document.getElementById('2301.10312v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 January, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2023. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7 pages</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 15A06; 05C50; 05C63 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2212.10622">arXiv:2212.10622</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2212.10622">pdf</a>, <a href="https://arxiv.org/format/2212.10622">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> mFACE: Multilingual Summarization with Factual Consistency Evaluation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Aharoni%2C+R">Roee Aharoni</a>, <a href="/search/cs?searchtype=author&amp;query=Narayan%2C+S">Shashi Narayan</a>, <a href="/search/cs?searchtype=author&amp;query=Maynez%2C+J">Joshua Maynez</a>, <a href="/search/cs?searchtype=author&amp;query=Herzig%2C+J">Jonathan Herzig</a>, <a href="/search/cs?searchtype=author&amp;query=Clark%2C+E">Elizabeth Clark</a>, <a href="/search/cs?searchtype=author&amp;query=Lapata%2C+M">Mirella Lapata</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2212.10622v2-abstract-short" style="display: inline;"> Abstractive summarization has enjoyed renewed interest in recent years, thanks to pre-trained language models and the availability of large-scale datasets. Despite promising results, current models still suffer from generating factually inconsistent summaries, reducing their utility for real-world application. 
Several recent efforts attempt to address this by devising models that automatically det&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.10622v2-abstract-full').style.display = 'inline'; document.getElementById('2212.10622v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2212.10622v2-abstract-full" style="display: none;"> Abstractive summarization has enjoyed renewed interest in recent years, thanks to pre-trained language models and the availability of large-scale datasets. Despite promising results, current models still suffer from generating factually inconsistent summaries, reducing their utility for real-world application. Several recent efforts attempt to address this by devising models that automatically detect factual inconsistencies in machine generated summaries. However, they focus exclusively on English, a language with abundant resources. In this work, we leverage factual consistency evaluation models to improve multilingual summarization. We explore two intuitive approaches to mitigate hallucinations based on the signal provided by a multilingual NLI model, namely data filtering and controlled generation. Experimental results in the 45 languages from the XLSum dataset show gains over strong baselines in both automatic and human evaluation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.10622v2-abstract-full').style.display = 'none'; document.getElementById('2212.10622v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 December, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2022. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">28 pages with links to released data</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2212.09682">arXiv:2212.09682</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2212.09682">pdf</a>, <a href="https://arxiv.org/format/2212.09682">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Multilingual Sequence-to-Sequence Models for Hebrew NLP </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Eyal%2C+M">Matan Eyal</a>, <a href="/search/cs?searchtype=author&amp;query=Noga%2C+H">Hila Noga</a>, <a href="/search/cs?searchtype=author&amp;query=Aharoni%2C+R">Roee Aharoni</a>, <a href="/search/cs?searchtype=author&amp;query=Szpektor%2C+I">Idan Szpektor</a>, <a href="/search/cs?searchtype=author&amp;query=Tsarfaty%2C+R">Reut Tsarfaty</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2212.09682v1-abstract-short" style="display: inline;"> Recent work attributes progress in NLP to large language models (LMs) with increased model size and large quantities of pretraining data. Despite this, current state-of-the-art LMs for Hebrew are both under-parameterized and under-trained compared to LMs in other languages. Additionally, previous work on pretrained Hebrew LMs focused on encoder-only models. 
While the encoder-only architecture is b&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.09682v1-abstract-full').style.display = 'inline'; document.getElementById('2212.09682v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2212.09682v1-abstract-full" style="display: none;"> Recent work attributes progress in NLP to large language models (LMs) with increased model size and large quantities of pretraining data. Despite this, current state-of-the-art LMs for Hebrew are both under-parameterized and under-trained compared to LMs in other languages. Additionally, previous work on pretrained Hebrew LMs focused on encoder-only models. While the encoder-only architecture is beneficial for classification tasks, it does not cater well for sub-word prediction tasks, such as Named Entity Recognition, when considering the morphologically rich nature of Hebrew. In this paper we argue that sequence-to-sequence generative architectures are more suitable for LLMs in the case of morphologically rich languages (MRLs) such as Hebrew. We demonstrate that by casting tasks in the Hebrew NLP pipeline as text-to-text tasks, we can leverage powerful multilingual, pretrained sequence-to-sequence models as mT5, eliminating the need for a specialized, morpheme-based, separately fine-tuned decoder. Using this approach, our experiments show substantial improvements over previously published results on existing Hebrew NLP benchmarks. These results suggest that multilingual sequence-to-sequence models present a promising building block for NLP for MRLs. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.09682v1-abstract-full').style.display = 'none'; document.getElementById('2212.09682v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 December, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2212.08037">arXiv:2212.08037</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2212.08037">pdf</a>, <a href="https://arxiv.org/format/2212.08037">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Attributed Question Answering: Evaluation and Modeling for Attributed Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Bohnet%2C+B">Bernd Bohnet</a>, <a href="/search/cs?searchtype=author&amp;query=Tran%2C+V+Q">Vinh Q. 
Tran</a>, <a href="/search/cs?searchtype=author&amp;query=Verga%2C+P">Pat Verga</a>, <a href="/search/cs?searchtype=author&amp;query=Aharoni%2C+R">Roee Aharoni</a>, <a href="/search/cs?searchtype=author&amp;query=Andor%2C+D">Daniel Andor</a>, <a href="/search/cs?searchtype=author&amp;query=Soares%2C+L+B">Livio Baldini Soares</a>, <a href="/search/cs?searchtype=author&amp;query=Ciaramita%2C+M">Massimiliano Ciaramita</a>, <a href="/search/cs?searchtype=author&amp;query=Eisenstein%2C+J">Jacob Eisenstein</a>, <a href="/search/cs?searchtype=author&amp;query=Ganchev%2C+K">Kuzman Ganchev</a>, <a href="/search/cs?searchtype=author&amp;query=Herzig%2C+J">Jonathan Herzig</a>, <a href="/search/cs?searchtype=author&amp;query=Hui%2C+K">Kai Hui</a>, <a href="/search/cs?searchtype=author&amp;query=Kwiatkowski%2C+T">Tom Kwiatkowski</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+J">Ji Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Ni%2C+J">Jianmo Ni</a>, <a href="/search/cs?searchtype=author&amp;query=Saralegui%2C+L+S">Lierni Sestorain Saralegui</a>, <a href="/search/cs?searchtype=author&amp;query=Schuster%2C+T">Tal Schuster</a>, <a href="/search/cs?searchtype=author&amp;query=Cohen%2C+W+W">William W. Cohen</a>, <a href="/search/cs?searchtype=author&amp;query=Collins%2C+M">Michael Collins</a>, <a href="/search/cs?searchtype=author&amp;query=Das%2C+D">Dipanjan Das</a>, <a href="/search/cs?searchtype=author&amp;query=Metzler%2C+D">Donald Metzler</a>, <a href="/search/cs?searchtype=author&amp;query=Petrov%2C+S">Slav Petrov</a>, <a href="/search/cs?searchtype=author&amp;query=Webster%2C+K">Kellie Webster</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2212.08037v2-abstract-short" style="display: inline;"> Large language models (LLMs) have shown impressive results while requiring little or no direct supervision. 
Further, there is mounting evidence that LLMs may have potential in information-seeking scenarios. We believe the ability of an LLM to attribute the text that it generates is likely to be crucial in this setting. We formulate and study Attributed QA as a key first step in the development of&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.08037v2-abstract-full').style.display = 'inline'; document.getElementById('2212.08037v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2212.08037v2-abstract-full" style="display: none;"> Large language models (LLMs) have shown impressive results while requiring little or no direct supervision. Further, there is mounting evidence that LLMs may have potential in information-seeking scenarios. We believe the ability of an LLM to attribute the text that it generates is likely to be crucial in this setting. We formulate and study Attributed QA as a key first step in the development of attributed LLMs. We propose a reproducible evaluation framework for the task and benchmark a broad set of architectures. We take human annotations as a gold standard and show that a correlated automatic metric is suitable for development. Our experimental work gives concrete answers to two key questions (How to measure attribution?, and How well do current state-of-the-art methods perform on attribution?), and give some hints as to how to address a third (How to build LLMs with attribution?). 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.08037v2-abstract-full').style.display = 'none'; document.getElementById('2212.08037v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 February, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 December, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2211.05655">arXiv:2211.05655</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2211.05655">pdf</a>, <a href="https://arxiv.org/format/2211.05655">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> DisentQA: Disentangling Parametric and Contextual Knowledge with Counterfactual Question Answering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Neeman%2C+E">Ella Neeman</a>, <a href="/search/cs?searchtype=author&amp;query=Aharoni%2C+R">Roee Aharoni</a>, <a href="/search/cs?searchtype=author&amp;query=Honovich%2C+O">Or Honovich</a>, <a href="/search/cs?searchtype=author&amp;query=Choshen%2C+L">Leshem Choshen</a>, <a href="/search/cs?searchtype=author&amp;query=Szpektor%2C+I">Idan Szpektor</a>, <a href="/search/cs?searchtype=author&amp;query=Abend%2C+O">Omri Abend</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis 
has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2211.05655v1-abstract-short" style="display: inline;"> Question answering models commonly have access to two sources of &#34;knowledge&#34; during inference time: (1) parametric knowledge - the factual knowledge encoded in the model weights, and (2) contextual knowledge - external knowledge (e.g., a Wikipedia passage) given to the model to generate a grounded answer. Having these two sources of knowledge entangled together is a core issue for generative QA mo&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.05655v1-abstract-full').style.display = 'inline'; document.getElementById('2211.05655v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2211.05655v1-abstract-full" style="display: none;"> Question answering models commonly have access to two sources of &#34;knowledge&#34; during inference time: (1) parametric knowledge - the factual knowledge encoded in the model weights, and (2) contextual knowledge - external knowledge (e.g., a Wikipedia passage) given to the model to generate a grounded answer. Having these two sources of knowledge entangled together is a core issue for generative QA models as it is unclear whether the answer stems from the given non-parametric knowledge or not. This unclarity has implications on issues of trust, interpretability and factuality. In this work, we propose a new paradigm in which QA models are trained to disentangle the two sources of knowledge. Using counterfactual data augmentation, we introduce a model that predicts two answers for a given question: one based on given contextual knowledge and one based on parametric knowledge. 
Our experiments on the Natural Questions dataset show that this approach improves the performance of QA models by making them more robust to knowledge conflicts between the two knowledge sources, while generating useful disentangled answers. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.05655v1-abstract-full').style.display = 'none'; document.getElementById('2211.05655v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 November, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 2 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2204.04991">arXiv:2204.04991</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2204.04991">pdf</a>, <a href="https://arxiv.org/format/2204.04991">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> TRUE: Re-evaluating Factual Consistency Evaluation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Honovich%2C+O">Or Honovich</a>, <a href="/search/cs?searchtype=author&amp;query=Aharoni%2C+R">Roee Aharoni</a>, <a href="/search/cs?searchtype=author&amp;query=Herzig%2C+J">Jonathan Herzig</a>, <a href="/search/cs?searchtype=author&amp;query=Taitelbaum%2C+H">Hagai Taitelbaum</a>, <a href="/search/cs?searchtype=author&amp;query=Kukliansy%2C+D">Doron Kukliansy</a>, <a href="/search/cs?searchtype=author&amp;query=Cohen%2C+V">Vered Cohen</a>, <a 
href="/search/cs?searchtype=author&amp;query=Scialom%2C+T">Thomas Scialom</a>, <a href="/search/cs?searchtype=author&amp;query=Szpektor%2C+I">Idan Szpektor</a>, <a href="/search/cs?searchtype=author&amp;query=Hassidim%2C+A">Avinatan Hassidim</a>, <a href="/search/cs?searchtype=author&amp;query=Matias%2C+Y">Yossi Matias</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2204.04991v3-abstract-short" style="display: inline;"> Grounded text generation systems often generate text that contains factual inconsistencies, hindering their real-world applicability. Automatic factual consistency evaluation may help alleviate this limitation by accelerating evaluation cycles, filtering inconsistent outputs and augmenting training data. While attracting increasing attention, such evaluation metrics are usually developed and evalu&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2204.04991v3-abstract-full').style.display = 'inline'; document.getElementById('2204.04991v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2204.04991v3-abstract-full" style="display: none;"> Grounded text generation systems often generate text that contains factual inconsistencies, hindering their real-world applicability. Automatic factual consistency evaluation may help alleviate this limitation by accelerating evaluation cycles, filtering inconsistent outputs and augmenting training data. While attracting increasing attention, such evaluation metrics are usually developed and evaluated in silo for a single task or dataset, slowing their adoption. Moreover, previous meta-evaluation protocols focused on system-level correlations with human annotations, which leave the example-level accuracy of such metrics unclear. 
In this work, we introduce TRUE: a comprehensive survey and assessment of factual consistency metrics on a standardized collection of existing texts from diverse tasks, manually annotated for factual consistency. Our standardization enables an example-level meta-evaluation protocol that is more actionable and interpretable than previously reported correlations, yielding clearer quality measures. Across diverse state-of-the-art metrics and 11 datasets we find that large-scale NLI and question generation-and-answering-based approaches achieve strong and complementary results. We recommend those methods as a starting point for model and metric developers, and hope TRUE will foster progress towards even better evaluation methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2204.04991v3-abstract-full').style.display = 'none'; document.getElementById('2204.04991v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 May, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 11 April, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2022. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted as a long paper to NAACL 2022 main conference</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2110.14332">arXiv:2110.14332</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2110.14332">pdf</a>, <a href="https://arxiv.org/format/2110.14332">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Combinatorics">math.CO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Discrete Mathematics">cs.DM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Probability">math.PR</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1007/s11856-023-2502-z">10.1007/s11856-023-2502-z <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Rainbow cycles for families of matchings </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Aharoni%2C+R">Ron Aharoni</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+H">He Guo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2110.14332v3-abstract-short" style="display: inline;"> Given a graph $G$ and a coloring of its edges, a subgraph of $G$ is called rainbow if its edges have distinct colors. The rainbow girth of an edge coloring of G is the minimum length of a rainbow cycle in G. 
A generalization of the famous Caccetta-Häggkvist conjecture, proposed by the first author, is that if in a coloring of the edge set of an $n$-vertex graph by $n$ colors, in which each color&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.14332v3-abstract-full').style.display = 'inline'; document.getElementById('2110.14332v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2110.14332v3-abstract-full" style="display: none;"> Given a graph $G$ and a coloring of its edges, a subgraph of $G$ is called rainbow if its edges have distinct colors. The rainbow girth of an edge coloring of G is the minimum length of a rainbow cycle in G. A generalization of the famous Caccetta-Häggkvist conjecture, proposed by the first author, is that if in a coloring of the edge set of an $n$-vertex graph by $n$ colors, in which each color class is of size $k$, the rainbow girth is at most $\lceil \frac{n}{k} \rceil$. In the known examples for sharpness of this conjecture the color classes are stars, suggesting that when the color classes are matchings, the result may be improved. We show that the rainbow girth of $n$ matchings of size at least 2 is $O(\log n)$. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.14332v3-abstract-full').style.display = 'none'; document.getElementById('2110.14332v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 October, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2021. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages; minor edits; to appear in Israel Journal of Mathematics</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 05C35; 05D40 </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Israel Journal of Mathematics 256 (2023), 1--8 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2104.08202">arXiv:2104.08202</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2104.08202">pdf</a>, <a href="https://arxiv.org/format/2104.08202">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> $Q^{2}$: Evaluating Factual Consistency in Knowledge-Grounded Dialogues via Question Generation and Question Answering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Honovich%2C+O">Or Honovich</a>, <a href="/search/cs?searchtype=author&amp;query=Choshen%2C+L">Leshem Choshen</a>, <a href="/search/cs?searchtype=author&amp;query=Aharoni%2C+R">Roee Aharoni</a>, <a href="/search/cs?searchtype=author&amp;query=Neeman%2C+E">Ella Neeman</a>, <a href="/search/cs?searchtype=author&amp;query=Szpektor%2C+I">Idan Szpektor</a>, <a href="/search/cs?searchtype=author&amp;query=Abend%2C+O">Omri Abend</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2104.08202v2-abstract-short" style="display: inline;"> Neural knowledge-grounded generative models for dialogue often produce content that is factually 
inconsistent with the knowledge they rely on, making them unreliable and limiting their applicability. Inspired by recent work on evaluating factual consistency in abstractive summarization, we propose an automatic evaluation metric for factual consistency in knowledge-grounded dialogue using automatic&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2104.08202v2-abstract-full').style.display = 'inline'; document.getElementById('2104.08202v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2104.08202v2-abstract-full" style="display: none;"> Neural knowledge-grounded generative models for dialogue often produce content that is factually inconsistent with the knowledge they rely on, making them unreliable and limiting their applicability. Inspired by recent work on evaluating factual consistency in abstractive summarization, we propose an automatic evaluation metric for factual consistency in knowledge-grounded dialogue using automatic question generation and question answering. Our metric, denoted $Q^2$, compares answer spans using natural language inference (NLI), instead of token-based matching as done in previous work. To foster proper evaluation, we curate a novel dataset of dialogue system outputs for the Wizard-of-Wikipedia dataset, manually annotated for factual consistency. We perform a thorough meta-evaluation of $Q^2$ against other metrics using this dataset and two others, where it consistently shows higher correlation with human judgements. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2104.08202v2-abstract-full').style.display = 'none'; document.getElementById('2104.08202v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 September, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 April, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to EMNLP 2021</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2009.11027">arXiv:2009.11027</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2009.11027">pdf</a>, <a href="https://arxiv.org/format/2009.11027">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> KoBE: Knowledge-Based Machine Translation Evaluation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gekhman%2C+Z">Zorik Gekhman</a>, <a href="/search/cs?searchtype=author&amp;query=Aharoni%2C+R">Roee Aharoni</a>, <a href="/search/cs?searchtype=author&amp;query=Beryozkin%2C+G">Genady Beryozkin</a>, <a href="/search/cs?searchtype=author&amp;query=Freitag%2C+M">Markus Freitag</a>, <a href="/search/cs?searchtype=author&amp;query=Macherey%2C+W">Wolfgang Macherey</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2009.11027v1-abstract-short" style="display: inline;"> We 
propose a simple and effective method for machine translation evaluation which does not require reference translations. Our approach is based on (1) grounding the entity mentions found in each source sentence and candidate translation against a large-scale multilingual knowledge base, and (2) measuring the recall of the grounded entities found in the candidate vs. those found in the source. Our&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2009.11027v1-abstract-full').style.display = 'inline'; document.getElementById('2009.11027v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2009.11027v1-abstract-full" style="display: none;"> We propose a simple and effective method for machine translation evaluation which does not require reference translations. Our approach is based on (1) grounding the entity mentions found in each source sentence and candidate translation against a large-scale multilingual knowledge base, and (2) measuring the recall of the grounded entities found in the candidate vs. those found in the source. Our approach achieves the highest correlation with human judgements on 9 out of the 18 language pairs from the WMT19 benchmark for evaluation without references, which is the largest number of wins for a single evaluation method on this task. On 4 language pairs, we also achieve higher correlation with human judgements than BLEU. To foster further research, we release a dataset containing 1.8 million grounded entity mentions across 18 language pairs from the WMT19 metrics track data. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2009.11027v1-abstract-full').style.display = 'none'; document.getElementById('2009.11027v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 September, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted as a short paper in Findings of EMNLP 2020</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2008.04637">arXiv:2008.04637</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2008.04637">pdf</a>, <a href="https://arxiv.org/format/2008.04637">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Real-Time Sign Language Detection using Human Pose Estimation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Moryossef%2C+A">Amit Moryossef</a>, <a href="/search/cs?searchtype=author&amp;query=Tsochantaridis%2C+I">Ioannis Tsochantaridis</a>, <a href="/search/cs?searchtype=author&amp;query=Aharoni%2C+R">Roee Aharoni</a>, <a href="/search/cs?searchtype=author&amp;query=Ebling%2C+S">Sarah Ebling</a>, <a href="/search/cs?searchtype=author&amp;query=Narayanan%2C+S">Srini Narayanan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short 
has-text-grey-dark mathjax" id="2008.04637v2-abstract-short" style="display: inline;"> We propose a lightweight real-time sign language detection model, as we identify the need for such a case in videoconferencing. We extract optical flow features based on human pose estimation and, using a linear classifier, show these features are meaningful with an accuracy of 80%, evaluated on the DGS Corpus. Using a recurrent model directly on the input, we see improvements of up to 91% accurac&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2008.04637v2-abstract-full').style.display = 'inline'; document.getElementById('2008.04637v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2008.04637v2-abstract-full" style="display: none;"> We propose a lightweight real-time sign language detection model, as we identify the need for such a case in videoconferencing. We extract optical flow features based on human pose estimation and, using a linear classifier, show these features are meaningful with an accuracy of 80%, evaluated on the DGS Corpus. Using a recurrent model directly on the input, we see improvements of up to 91% accuracy, while still working under 4ms. We describe a demo application to sign language detection in the browser in order to demonstrate its usage possibility in videoconferencing applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2008.04637v2-abstract-full').style.display = 'none'; document.getElementById('2008.04637v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 September, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 11 August, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2020. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2004.02105">arXiv:2004.02105</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2004.02105">pdf</a>, <a href="https://arxiv.org/format/2004.02105">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Unsupervised Domain Clusters in Pretrained Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Aharoni%2C+R">Roee Aharoni</a>, <a href="/search/cs?searchtype=author&amp;query=Goldberg%2C+Y">Yoav Goldberg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2004.02105v2-abstract-short" style="display: inline;"> The notion of &#34;in-domain data&#34; in NLP is often over-simplistic and vague, as textual data varies in many nuanced linguistic aspects such as topic, style or level of formality. In addition, domain labels are many times unavailable, making it challenging to build domain-specific systems. 
We show that massive pre-trained language models implicitly learn sentence representations that cluster by domain&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2004.02105v2-abstract-full').style.display = 'inline'; document.getElementById('2004.02105v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2004.02105v2-abstract-full" style="display: none;"> The notion of &#34;in-domain data&#34; in NLP is often over-simplistic and vague, as textual data varies in many nuanced linguistic aspects such as topic, style or level of formality. In addition, domain labels are many times unavailable, making it challenging to build domain-specific systems. We show that massive pre-trained language models implicitly learn sentence representations that cluster by domains without supervision -- suggesting a simple data-driven definition of domains in textual data. We harness this property and propose domain data selection methods based on such models, which require only a small set of in-domain monolingual data. We evaluate our data selection methods for neural machine translation across five diverse domains, where they outperform an established approach as measured by both BLEU and by precision and recall of sentence selection with respect to an oracle. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2004.02105v2-abstract-full').style.display = 'none'; document.getElementById('2004.02105v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 May, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 5 April, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2020. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted as a long paper in ACL 2020</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1910.09302">arXiv:1910.09302</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1910.09302">pdf</a>, <a href="https://arxiv.org/format/1910.09302">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Diversify Your Datasets: Analyzing Generalization via Controlled Variance in Adversarial Datasets </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Rozen%2C+O">Ohad Rozen</a>, <a href="/search/cs?searchtype=author&amp;query=Shwartz%2C+V">Vered Shwartz</a>, <a href="/search/cs?searchtype=author&amp;query=Aharoni%2C+R">Roee Aharoni</a>, <a href="/search/cs?searchtype=author&amp;query=Dagan%2C+I">Ido Dagan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1910.09302v1-abstract-short" style="display: inline;"> Phenomenon-specific &#34;adversarial&#34; datasets have been recently designed to perform targeted stress-tests for particular inference types. Recent work (Liu et al., 2019a) proposed that such datasets can be utilized for training NLI and other types of models, often allowing to learn the phenomenon in focus and improve on the challenge dataset, indicating a &#34;blind spot&#34; in the original training data. 
Y&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1910.09302v1-abstract-full').style.display = 'inline'; document.getElementById('1910.09302v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1910.09302v1-abstract-full" style="display: none;"> Phenomenon-specific &#34;adversarial&#34; datasets have been recently designed to perform targeted stress-tests for particular inference types. Recent work (Liu et al., 2019a) proposed that such datasets can be utilized for training NLI and other types of models, often allowing to learn the phenomenon in focus and improve on the challenge dataset, indicating a &#34;blind spot&#34; in the original training data. Yet, although a model can improve in such a training process, it might still be vulnerable to other challenge datasets targeting the same phenomenon but drawn from a different distribution, such as having a different syntactic complexity level. In this work, we extend this method to drive conclusions about a model&#39;s ability to learn and generalize a target phenomenon rather than to &#34;learn&#34; a dataset, by controlling additional aspects in the adversarial datasets. We demonstrate our approach on two inference phenomena - dative alternation and numerical reasoning, elaborating, and in some cases contradicting, the results of Liu et al.. Our methodology enables building better challenge datasets for creating more robust models, and may yield better model understanding and subsequent overarching improvements. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1910.09302v1-abstract-full').style.display = 'none'; document.getElementById('1910.09302v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 October, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2019. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">CoNLL 2019</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1903.07091">arXiv:1903.07091</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1903.07091">pdf</a>, <a href="https://arxiv.org/format/1903.07091">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> The Missing Ingredient in Zero-Shot Neural Machine Translation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Arivazhagan%2C+N">Naveen Arivazhagan</a>, <a href="/search/cs?searchtype=author&amp;query=Bapna%2C+A">Ankur Bapna</a>, <a href="/search/cs?searchtype=author&amp;query=Firat%2C+O">Orhan Firat</a>, <a href="/search/cs?searchtype=author&amp;query=Aharoni%2C+R">Roee Aharoni</a>, <a href="/search/cs?searchtype=author&amp;query=Johnson%2C+M">Melvin Johnson</a>, <a href="/search/cs?searchtype=author&amp;query=Macherey%2C+W">Wolfgang Macherey</a> </p> <p class="abstract mathjax"> 
<span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1903.07091v1-abstract-short" style="display: inline;"> Multilingual Neural Machine Translation (NMT) models are capable of translating between multiple source and target languages. Despite various approaches to train such models, they have difficulty with zero-shot translation: translating between language pairs that were not together seen during training. In this paper we first diagnose why state-of-the-art multilingual NMT models that rely purely on&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1903.07091v1-abstract-full').style.display = 'inline'; document.getElementById('1903.07091v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1903.07091v1-abstract-full" style="display: none;"> Multilingual Neural Machine Translation (NMT) models are capable of translating between multiple source and target languages. Despite various approaches to train such models, they have difficulty with zero-shot translation: translating between language pairs that were not together seen during training. In this paper we first diagnose why state-of-the-art multilingual NMT models that rely purely on parameter sharing, fail to generalize to unseen language pairs. We then propose auxiliary losses on the NMT encoder that impose representational invariance across languages. Our simple approach vastly improves zero-shot translation quality without regressing on supervised directions. For the first time, on WMT14 English-French-German, we achieve zero-shot performance that is on par with pivoting. We also demonstrate the easy scalability of our approach to multiple languages on the IWSLT 2017 shared task. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1903.07091v1-abstract-full').style.display = 'none'; document.getElementById('1903.07091v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 March, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2019. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1903.03467">arXiv:1903.03467</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1903.03467">pdf</a>, <a href="https://arxiv.org/format/1903.03467">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Filling Gender &amp; Number Gaps in Neural Machine Translation with Black-box Context Injection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Moryossef%2C+A">Amit Moryossef</a>, <a href="/search/cs?searchtype=author&amp;query=Aharoni%2C+R">Roee Aharoni</a>, <a href="/search/cs?searchtype=author&amp;query=Goldberg%2C+Y">Yoav Goldberg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1903.03467v1-abstract-short" style="display: inline;"> When translating from a language that does not morphologically mark information such as gender and number into a language that does, translation systems must &#34;guess&#34; this missing information, often leading to incorrect translations in the given context. 
We propose a black-box approach for injecting the missing information to a pre-trained neural machine translation system, allowing to control the&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1903.03467v1-abstract-full').style.display = 'inline'; document.getElementById('1903.03467v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1903.03467v1-abstract-full" style="display: none;"> When translating from a language that does not morphologically mark information such as gender and number into a language that does, translation systems must &#34;guess&#34; this missing information, often leading to incorrect translations in the given context. We propose a black-box approach for injecting the missing information to a pre-trained neural machine translation system, allowing to control the morphological variations in the generated translations without changing the underlying model or training data. We evaluate our method on an English to Hebrew translation task, and show that it is effective in injecting the gender and number information and that supplying the correct information improves the translation accuracy in up to 2.3 BLEU on a female-speaker test set for a state-of-the-art online black-box system. Finally, we perform a fine-grained syntactic analysis of the generated translations that shows the effectiveness of our method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1903.03467v1-abstract-full').style.display = 'none'; document.getElementById('1903.03467v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 March, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2019. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">6 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1903.00089">arXiv:1903.00089</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1903.00089">pdf</a>, <a href="https://arxiv.org/format/1903.00089">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Massively Multilingual Neural Machine Translation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Aharoni%2C+R">Roee Aharoni</a>, <a href="/search/cs?searchtype=author&amp;query=Johnson%2C+M">Melvin Johnson</a>, <a href="/search/cs?searchtype=author&amp;query=Firat%2C+O">Orhan Firat</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1903.00089v3-abstract-short" style="display: inline;"> Multilingual neural machine translation (NMT) enables training a single model that supports translation from multiple source languages into multiple target languages. In this paper, we push the limits of multilingual NMT in terms of number of languages being used. 
We perform extensive experiments in training massively multilingual NMT models, translating up to 102 languages to and from English wit&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1903.00089v3-abstract-full').style.display = 'inline'; document.getElementById('1903.00089v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1903.00089v3-abstract-full" style="display: none;"> Multilingual neural machine translation (NMT) enables training a single model that supports translation from multiple source languages into multiple target languages. In this paper, we push the limits of multilingual NMT in terms of number of languages being used. We perform extensive experiments in training massively multilingual NMT models, translating up to 102 languages to and from English within a single model. We explore different setups for training such models and analyze the trade-offs between translation quality and various modeling decisions. We report results on the publicly available TED talks multilingual corpus where we show that massively multilingual many-to-many models are effective in low resource settings, outperforming the previous state-of-the-art while supporting up to 59 languages. Our experiments on a large-scale dataset with 102 languages to and from English and up to one million examples per direction also show promising results, surpassing strong bilingual baselines and encouraging future work on massively multilingual NMT. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1903.00089v3-abstract-full').style.display = 'none'; document.getElementById('1903.00089v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 July, 2019; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 February, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2019. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted as a long paper in NAACL 2019</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1805.01035">arXiv:1805.01035</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1805.01035">pdf</a>, <a href="https://arxiv.org/format/1805.01035">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Split and Rephrase: Better Evaluation and a Stronger Baseline </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Aharoni%2C+R">Roee Aharoni</a>, <a href="/search/cs?searchtype=author&amp;query=Goldberg%2C+Y">Yoav Goldberg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1805.01035v1-abstract-short" style="display: inline;"> Splitting and rephrasing a complex sentence into several shorter sentences that convey the same meaning is a challenging problem in NLP. 
We show that while vanilla seq2seq models can reach high scores on the proposed benchmark (Narayan et al., 2017), they suffer from memorization of the training set which contains more than 89% of the unique simple sentences from the validation and test sets. To a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1805.01035v1-abstract-full').style.display = 'inline'; document.getElementById('1805.01035v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1805.01035v1-abstract-full" style="display: none;"> Splitting and rephrasing a complex sentence into several shorter sentences that convey the same meaning is a challenging problem in NLP. We show that while vanilla seq2seq models can reach high scores on the proposed benchmark (Narayan et al., 2017), they suffer from memorization of the training set which contains more than 89% of the unique simple sentences from the validation and test sets. To aid this, we present a new train-development-test data split and neural models augmented with a copy-mechanism, outperforming the best reported baseline by 8.68 BLEU and fostering further progress on the task. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1805.01035v1-abstract-full').style.display = 'none'; document.getElementById('1805.01035v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 May, 2018; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2018. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted as a short paper in ACL 2018</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1704.04743">arXiv:1704.04743</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1704.04743">pdf</a>, <a href="https://arxiv.org/format/1704.04743">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Towards String-to-Tree Neural Machine Translation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Aharoni%2C+R">Roee Aharoni</a>, <a href="/search/cs?searchtype=author&amp;query=Goldberg%2C+Y">Yoav Goldberg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1704.04743v3-abstract-short" style="display: inline;"> We present a simple method to incorporate syntactic information about the target language in a neural machine translation system by translating into linearized, lexicalized constituency trees. An experiment on the WMT16 German-English news translation task resulted in an improved BLEU score when compared to a syntax-agnostic NMT baseline trained on the same dataset. 
An analysis of the translations&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1704.04743v3-abstract-full').style.display = 'inline'; document.getElementById('1704.04743v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1704.04743v3-abstract-full" style="display: none;"> We present a simple method to incorporate syntactic information about the target language in a neural machine translation system by translating into linearized, lexicalized constituency trees. An experiment on the WMT16 German-English news translation task resulted in an improved BLEU score when compared to a syntax-agnostic NMT baseline trained on the same dataset. An analysis of the translations from the syntax-aware system shows that it performs more reordering during translation in comparison to the baseline. A small-scale human evaluation also showed an advantage to the syntax-aware system. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1704.04743v3-abstract-full').style.display = 'none'; document.getElementById('1704.04743v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 May, 2017; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 April, 2017; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2017. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted as a short paper in ACL 2017</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1611.01487">arXiv:1611.01487</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1611.01487">pdf</a>, <a href="https://arxiv.org/format/1611.01487">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Morphological Inflection Generation with Hard Monotonic Attention </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Aharoni%2C+R">Roee Aharoni</a>, <a href="/search/cs?searchtype=author&amp;query=Goldberg%2C+Y">Yoav Goldberg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1611.01487v3-abstract-short" style="display: inline;"> We present a neural model for morphological inflection generation which employs a hard attention mechanism, inspired by the nearly-monotonic alignment commonly found between the characters in a word and the characters in its inflection. 
We evaluate the model on three previously studied morphological inflection generation datasets and show that it provides state of the art results in various setups&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1611.01487v3-abstract-full').style.display = 'inline'; document.getElementById('1611.01487v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1611.01487v3-abstract-full" style="display: none;"> We present a neural model for morphological inflection generation which employs a hard attention mechanism, inspired by the nearly-monotonic alignment commonly found between the characters in a word and the characters in its inflection. We evaluate the model on three previously studied morphological inflection generation datasets and show that it provides state of the art results in various setups compared to previous neural and non-neural approaches. Finally we present an analysis of the continuous representations learned by both the hard and soft attention \cite{bahdanauCB14} models for the task, shedding some light on the features such models extract. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1611.01487v3-abstract-full').style.display = 'none'; document.getElementById('1611.01487v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 April, 2017; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 November, 2016; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2016. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted as a long paper in ACL 2017</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1305.6164">arXiv:1305.6164</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1305.6164">pdf</a>, <a href="https://arxiv.org/ps/1305.6164">ps</a>, <a href="https://arxiv.org/format/1305.6164">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Combinatorics">math.CO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Discrete Mathematics">cs.DM</span> </div> </div> <p class="title is-5 mathjax"> On a Generalization of the Ryser-Brualdi-Stein Conjecture </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Aharoni%2C+R">Ron Aharoni</a>, <a href="/search/cs?searchtype=author&amp;query=Charbit%2C+P">Pierre Charbit</a>, <a href="/search/cs?searchtype=author&amp;query=Howard%2C+D">David Howard</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1305.6164v1-abstract-short" style="display: inline;"> A rainbow matching for (not necessarily distinct) sets F_1,...,F_k of hypergraph edges is a matching consisting of k edges, one from each F_i. The aim of the paper is twofold - to put order in the multitude of conjectures that relate to this concept (some of them first presented here), and to present some partial results on one of these conjectures, that seems central among them. 
</span> <span class="abstract-full has-text-grey-dark mathjax" id="1305.6164v1-abstract-full" style="display: none;"> A rainbow matching for (not necessarily distinct) sets F_1,...,F_k of hypergraph edges is a matching consisting of k edges, one from each F_i. The aim of the paper is twofold - to put order in the multitude of conjectures that relate to this concept (some of them first presented here), and to present some partial results on one of these conjectures, that seems central among them. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1305.6164v1-abstract-full').style.display = 'none'; document.getElementById('1305.6164v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 May, 2013; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2013. </p> </li> </ol> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 
154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" 
role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

Pages: 1 2 3 4 5 6 7 8 9 10